@mailwoman/neural 2.1.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/anchor-inference.d.ts +57 -0
- package/out/anchor-inference.d.ts.map +1 -0
- package/out/anchor-inference.js +94 -0
- package/out/anchor-inference.js.map +1 -0
- package/out/browser.d.ts +18 -0
- package/out/browser.d.ts.map +1 -0
- package/out/browser.js +19 -0
- package/out/browser.js.map +1 -0
- package/out/classifier.d.ts +145 -11
- package/out/classifier.d.ts.map +1 -1
- package/out/classifier.js +185 -20
- package/out/classifier.js.map +1 -1
- package/out/fst-prior.d.ts +71 -0
- package/out/fst-prior.d.ts.map +1 -0
- package/out/fst-prior.js +173 -0
- package/out/fst-prior.js.map +1 -0
- package/out/index.d.ts +7 -0
- package/out/index.d.ts.map +1 -1
- package/out/index.js +5 -0
- package/out/index.js.map +1 -1
- package/out/labels.d.ts +30 -6
- package/out/labels.d.ts.map +1 -1
- package/out/labels.js +43 -6
- package/out/labels.js.map +1 -1
- package/out/onnx-runner.d.ts +8 -1
- package/out/onnx-runner.d.ts.map +1 -1
- package/out/onnx-runner.js +31 -1
- package/out/onnx-runner.js.map +1 -1
- package/out/postcode-anchor.d.ts +117 -0
- package/out/postcode-anchor.d.ts.map +1 -0
- package/out/postcode-anchor.js +269 -0
- package/out/postcode-anchor.js.map +1 -0
- package/out/postcode-binary-resolver.d.ts +60 -0
- package/out/postcode-binary-resolver.d.ts.map +1 -0
- package/out/postcode-binary-resolver.js +208 -0
- package/out/postcode-binary-resolver.js.map +1 -0
- package/out/postcode-repair.d.ts +65 -0
- package/out/postcode-repair.d.ts.map +1 -0
- package/out/postcode-repair.js +171 -0
- package/out/postcode-repair.js.map +1 -0
- package/out/proposal-classifier.d.ts +5 -1
- package/out/proposal-classifier.d.ts.map +1 -1
- package/out/proposal-classifier.js +5 -3
- package/out/proposal-classifier.js.map +1 -1
- package/out/query-shape-prior.d.ts +74 -0
- package/out/query-shape-prior.d.ts.map +1 -0
- package/out/query-shape-prior.js +223 -0
- package/out/query-shape-prior.js.map +1 -0
- package/out/street-morphology-prior.d.ts +56 -0
- package/out/street-morphology-prior.d.ts.map +1 -0
- package/out/street-morphology-prior.js +159 -0
- package/out/street-morphology-prior.js.map +1 -0
- package/out/tokenizer.d.ts +6 -1
- package/out/tokenizer.d.ts.map +1 -1
- package/out/tokenizer.js +8 -3
- package/out/tokenizer.js.map +1 -1
- package/out/unit-repair.d.ts +46 -0
- package/out/unit-repair.d.ts.map +1 -0
- package/out/unit-repair.js +147 -0
- package/out/unit-repair.js.map +1 -0
- package/out/viterbi.d.ts +76 -0
- package/out/viterbi.d.ts.map +1 -0
- package/out/viterbi.js +163 -0
- package/out/viterbi.js.map +1 -0
- package/out/vitest.config.d.ts.map +1 -1
- package/out/vitest.config.js +3 -0
- package/out/vitest.config.js.map +1 -1
- package/out/weights.d.ts +42 -0
- package/out/weights.d.ts.map +1 -1
- package/out/weights.js +92 -4
- package/out/weights.js.map +1 -1
- package/package.json +10 -3
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Browser-side postcode resolver for the anchor (#240). A pure-JS, zero-dependency
|
|
7
|
+
* `PostcodeResolver` backed by a compact flat binary instead of SQLite, so the postcode anchor
|
|
8
|
+
* runs in the WASM/browser parser behind the same `lookup()` seam as the server-side
|
|
9
|
+
* `WofPostcodeLookup`.
|
|
10
|
+
*
|
|
11
|
+
* This file owns BOTH ends of the format — `serializePostcodeBinary` (run in Node by
|
|
12
|
+
* `scripts/build-postcode-binary.ts`) and `PostcodeBinaryResolver` (run in the browser) — so the
|
|
13
|
+
* layout can never drift between writer and reader.
|
|
14
|
+
*
|
|
15
|
+
* Binary layout (little-endian): magic "PCB1" (4 bytes) u32 recordCount u8 countryCount, then
|
|
16
|
+
* countryCount × 2 ASCII bytes (the country table) u8 keyWidth (max postcode length in bytes)
|
|
17
|
+
* records recordCount × { key[keyWidth] ASCII right-padded with 0x00, u8 countryIdx, i16 latQ,
|
|
18
|
+
* i16 lonQ }, sorted by key bytes ascending. A postcode present in two countries appears as two
|
|
19
|
+
* adjacent records (same key, different countryIdx).
|
|
20
|
+
*
|
|
21
|
+
* Coordinates are quantized to i16: latQ = round(lat/90 × 32767), lonQ = round(lon/180 × 32767),
|
|
22
|
+
* giving ~300 m resolution — ample for a "which city/region" anchor. A record with latQ = lonQ =
|
|
23
|
+
* 0 means "known postcode, no centroid" (membership only), matching the SQLite resolver's
|
|
24
|
+
* convention.
|
|
25
|
+
*/
|
|
26
|
+
const MAGIC = 0x31_42_43_50; // "PCB1" little-endian (P=0x50 C=0x43 B=0x42 1=0x31)
|
|
27
|
+
const REC_TAIL = 5; // countryIdx(1) + latQ(2) + lonQ(2)
|
|
28
|
+
const LAT_Q = 32767 / 90; // i16 quantization scale for latitude: latQ = round(lat × LAT_Q), so ±90° ↔ ±32767
|
|
29
|
+
const LON_Q = 32767 / 180; // i16 quantization scale for longitude: lonQ = round(lon × LON_Q), so ±180° ↔ ±32767
|
|
30
|
+
/**
 * Write `s` into `out[offset .. offset + width)` as a fixed-width key: one byte per character
 * (the low 7 bits of its UTF-16 code unit), followed by NUL padding up to `width`. NUL sorts
 * below any real character, so a shorter key orders before a longer key that shares its prefix.
 * Characters at index `width` and beyond are not written.
 */
function encodeKey(s, width, out, offset) {
    const copied = Math.min(s.length, width);
    let pos = 0;
    // Character bytes first…
    for (; pos < copied; pos++) {
        out[offset + pos] = s.charCodeAt(pos) & 0x7f;
    }
    // …then NUL padding out to the full key width.
    for (; pos < width; pos++) {
        out[offset + pos] = 0;
    }
}
|
|
38
|
+
/**
|
|
39
|
+
* Serialize postcode entries into the flat binary. Entries are sorted by (postcode, country) so
|
|
40
|
+
* equal postcodes land in adjacent records. Run in Node; consumed by
|
|
41
|
+
* {@link PostcodeBinaryResolver}.
|
|
42
|
+
*/
|
|
43
|
+
export function serializePostcodeBinary(entries) {
|
|
44
|
+
const sorted = [...entries].sort((a, b) => a.postcode < b.postcode
|
|
45
|
+
? -1
|
|
46
|
+
: a.postcode > b.postcode
|
|
47
|
+
? 1
|
|
48
|
+
: a.country < b.country
|
|
49
|
+
? -1
|
|
50
|
+
: a.country > b.country
|
|
51
|
+
? 1
|
|
52
|
+
: 0);
|
|
53
|
+
const countries = [...new Set(sorted.map((e) => e.country))].sort();
|
|
54
|
+
const countryIdx = new Map(countries.map((c, i) => [c, i]));
|
|
55
|
+
const keyWidth = sorted.reduce((m, e) => Math.max(m, e.postcode.length), 1);
|
|
56
|
+
const recSize = keyWidth + REC_TAIL;
|
|
57
|
+
const headerSize = 4 + 4 + 1 + countries.length * 2 + 1;
|
|
58
|
+
const buf = new Uint8Array(headerSize + sorted.length * recSize);
|
|
59
|
+
const view = new DataView(buf.buffer);
|
|
60
|
+
let o = 0;
|
|
61
|
+
view.setUint32(o, MAGIC, true);
|
|
62
|
+
o += 4;
|
|
63
|
+
view.setUint32(o, sorted.length, true);
|
|
64
|
+
o += 4;
|
|
65
|
+
buf[o++] = countries.length;
|
|
66
|
+
for (const c of countries) {
|
|
67
|
+
buf[o++] = c.charCodeAt(0) & 0x7f;
|
|
68
|
+
buf[o++] = c.charCodeAt(1) & 0x7f;
|
|
69
|
+
}
|
|
70
|
+
buf[o++] = keyWidth;
|
|
71
|
+
for (const e of sorted) {
|
|
72
|
+
encodeKey(e.postcode, keyWidth, buf, o);
|
|
73
|
+
o += keyWidth;
|
|
74
|
+
buf[o++] = countryIdx.get(e.country);
|
|
75
|
+
view.setInt16(o, Math.max(-32767, Math.min(32767, Math.round(e.lat * LAT_Q))), true);
|
|
76
|
+
o += 2;
|
|
77
|
+
view.setInt16(o, Math.max(-32767, Math.min(32767, Math.round(e.lon * LON_Q))), true);
|
|
78
|
+
o += 2;
|
|
79
|
+
}
|
|
80
|
+
return buf;
|
|
81
|
+
}
|
|
82
|
+
/**
|
|
83
|
+
* Pure-JS, browser-safe postcode resolver over the flat binary. Implements the same `lookup()` seam
|
|
84
|
+
* as the SQLite `WofPostcodeLookup`, so `extractPostcodeAnchors` is agnostic to which backs it.
|
|
85
|
+
*/
|
|
86
|
+
export class PostcodeBinaryResolver {
|
|
87
|
+
#buf;
|
|
88
|
+
#view;
|
|
89
|
+
#count;
|
|
90
|
+
#countries;
|
|
91
|
+
#keyWidth;
|
|
92
|
+
#recSize;
|
|
93
|
+
#recBase;
|
|
94
|
+
constructor(bytes) {
|
|
95
|
+
this.#buf = bytes;
|
|
96
|
+
this.#view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
|
|
97
|
+
if (this.#view.getUint32(0, true) !== MAGIC)
|
|
98
|
+
throw new Error("postcode binary: bad magic");
|
|
99
|
+
this.#count = this.#view.getUint32(4, true);
|
|
100
|
+
let o = 8;
|
|
101
|
+
const countryCount = bytes[o++];
|
|
102
|
+
this.#countries = [];
|
|
103
|
+
for (let i = 0; i < countryCount; i++) {
|
|
104
|
+
this.#countries.push(String.fromCharCode(bytes[o], bytes[o + 1]));
|
|
105
|
+
o += 2;
|
|
106
|
+
}
|
|
107
|
+
this.#keyWidth = bytes[o++];
|
|
108
|
+
this.#recSize = this.#keyWidth + REC_TAIL;
|
|
109
|
+
this.#recBase = o;
|
|
110
|
+
}
|
|
111
|
+
/** Compare the keyWidth bytes of record `i` against a padded query key. */
|
|
112
|
+
#cmpKey(i, key) {
|
|
113
|
+
const base = this.#recBase + i * this.#recSize;
|
|
114
|
+
for (let j = 0; j < this.#keyWidth; j++) {
|
|
115
|
+
const d = this.#buf[base + j] - key[j];
|
|
116
|
+
if (d !== 0)
|
|
117
|
+
return d;
|
|
118
|
+
}
|
|
119
|
+
return 0;
|
|
120
|
+
}
|
|
121
|
+
lookup(postcode) {
|
|
122
|
+
if (postcode.length > this.#keyWidth)
|
|
123
|
+
return []; // longer than any stored key → impossible
|
|
124
|
+
const key = new Uint8Array(this.#keyWidth);
|
|
125
|
+
encodeKey(postcode, this.#keyWidth, key, 0);
|
|
126
|
+
// Binary search for the first record whose key >= the query.
|
|
127
|
+
let lo = 0;
|
|
128
|
+
let hi = this.#count;
|
|
129
|
+
while (lo < hi) {
|
|
130
|
+
const mid = (lo + hi) >>> 1;
|
|
131
|
+
if (this.#cmpKey(mid, key) < 0)
|
|
132
|
+
lo = mid + 1;
|
|
133
|
+
else
|
|
134
|
+
hi = mid;
|
|
135
|
+
}
|
|
136
|
+
// Collect the contiguous run of equal keys (one per country).
|
|
137
|
+
const out = [];
|
|
138
|
+
for (let i = lo; i < this.#count && this.#cmpKey(i, key) === 0; i++) {
|
|
139
|
+
const base = this.#recBase + i * this.#recSize + this.#keyWidth;
|
|
140
|
+
out.push({
|
|
141
|
+
country: this.#countries[this.#buf[base]],
|
|
142
|
+
lat: this.#view.getInt16(base + 1, true) / LAT_Q,
|
|
143
|
+
lon: this.#view.getInt16(base + 3, true) / LON_Q,
|
|
144
|
+
});
|
|
145
|
+
}
|
|
146
|
+
return out;
|
|
147
|
+
}
|
|
148
|
+
/**
|
|
149
|
+
* Decode the whole binary into an {@link AnchorLookup} (`Map<postcode, AnchorEntry>`) for the
|
|
150
|
+
* neural anchor channel (#239/#240): each postcode → a uniform posterior over its member
|
|
151
|
+
* countries
|
|
152
|
+
*
|
|
153
|
+
* - The mean of its non-zero centroids. This is the browser-side equivalent of the pilot
|
|
154
|
+
* postcode→anchor lookup the model trained against, built live from the shipped binary instead
|
|
155
|
+
* of a precomputed JSON. Records are stored sorted by (postcode, country), so equal keys are
|
|
156
|
+
* contiguous.
|
|
157
|
+
*/
|
|
158
|
+
toAnchorLookup() {
|
|
159
|
+
const out = new Map();
|
|
160
|
+
let i = 0;
|
|
161
|
+
while (i < this.#count) {
|
|
162
|
+
// Decode this record's postcode key (ASCII, 0x00-right-padded).
|
|
163
|
+
const keyBase = this.#recBase + i * this.#recSize;
|
|
164
|
+
let postcode = "";
|
|
165
|
+
for (let j = 0; j < this.#keyWidth; j++) {
|
|
166
|
+
const c = this.#buf[keyBase + j];
|
|
167
|
+
if (c === 0)
|
|
168
|
+
break;
|
|
169
|
+
postcode += String.fromCharCode(c);
|
|
170
|
+
}
|
|
171
|
+
// Walk the contiguous run of records sharing this key (one per member country).
|
|
172
|
+
const posterior = {};
|
|
173
|
+
let latSum = 0;
|
|
174
|
+
let lonSum = 0;
|
|
175
|
+
let centroidCount = 0;
|
|
176
|
+
let k = i;
|
|
177
|
+
for (; k < this.#count; k++) {
|
|
178
|
+
const base = this.#recBase + k * this.#recSize;
|
|
179
|
+
let same = true;
|
|
180
|
+
for (let j = 0; j < this.#keyWidth; j++) {
|
|
181
|
+
if (this.#buf[base + j] !== this.#buf[keyBase + j]) {
|
|
182
|
+
same = false;
|
|
183
|
+
break;
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
if (!same)
|
|
187
|
+
break;
|
|
188
|
+
const tail = base + this.#keyWidth;
|
|
189
|
+
posterior[this.#countries[this.#buf[tail]]] = 1; // uniform — anchorFeatureVector renormalizes
|
|
190
|
+
const lat = this.#view.getInt16(tail + 1, true) / LAT_Q;
|
|
191
|
+
const lon = this.#view.getInt16(tail + 3, true) / LON_Q;
|
|
192
|
+
if (lat !== 0 || lon !== 0) {
|
|
193
|
+
latSum += lat;
|
|
194
|
+
lonSum += lon;
|
|
195
|
+
centroidCount++;
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
out.set(postcode, {
|
|
199
|
+
posterior,
|
|
200
|
+
lat: centroidCount ? latSum / centroidCount : 0,
|
|
201
|
+
lon: centroidCount ? lonSum / centroidCount : 0,
|
|
202
|
+
});
|
|
203
|
+
i = k;
|
|
204
|
+
}
|
|
205
|
+
return out;
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
//# sourceMappingURL=postcode-binary-resolver.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"postcode-binary-resolver.js","sourceRoot":"","sources":["../postcode-binary-resolver.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAKH,MAAM,KAAK,GAAG,aAAa,CAAA,CAAC,qDAAqD;AACjF,MAAM,QAAQ,GAAG,CAAC,CAAA,CAAC,oCAAoC;AACvD,MAAM,KAAK,GAAG,KAAK,GAAG,EAAE,CAAA;AACxB,MAAM,KAAK,GAAG,KAAK,GAAG,GAAG,CAAA;AASzB;;;GAGG;AACH,SAAS,SAAS,CAAC,CAAS,EAAE,KAAa,EAAE,GAAe,EAAE,MAAc;IAC3E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,EAAE;QAAE,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;AAC5F,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,uBAAuB,CAAC,OAAuC;IAC9E,MAAM,MAAM,GAAG,CAAC,GAAG,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CACzC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ;QACtB,CAAC,CAAC,CAAC,CAAC;QACJ,CAAC,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ;YACxB,CAAC,CAAC,CAAC;YACH,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,CAAC,CAAC,OAAO;gBACtB,CAAC,CAAC,CAAC,CAAC;gBACJ,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,CAAC,CAAC,OAAO;oBACtB,CAAC,CAAC,CAAC;oBACH,CAAC,CAAC,CAAC,CACP,CAAA;IACD,MAAM,SAAS,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;IACnE,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAA;IAC3D,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAA;IAC3E,MAAM,OAAO,GAAG,QAAQ,GAAG,QAAQ,CAAA;IAEnC,MAAM,UAAU,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,SAAS,CAAC,MAAM,GAAG,CAAC,GAAG,CAAC,CAAA;IACvD,MAAM,GAAG,GAAG,IAAI,UAAU,CAAC,UAAU,GAAG,MAAM,CAAC,MAAM,GAAG,OAAO,CAAC,CAAA;IAChE,MAAM,IAAI,GAAG,IAAI,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;IAErC,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,CAAA;IAC9B,CAAC,IAAI,CAAC,CAAA;IACN,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAA;IACtC,CAAC,IAAI,CAAC,CAAA;IACN,GAAG,
CAAC,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,CAAA;IAC3B,KAAK,MAAM,CAAC,IAAI,SAAS,EAAE,CAAC;QAC3B,GAAG,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,GAAG,IAAI,CAAA;QACjC,GAAG,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,GAAG,IAAI,CAAA;IAClC,CAAC;IACD,GAAG,CAAC,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAA;IAEnB,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACxB,SAAS,CAAC,CAAC,CAAC,QAAQ,EAAE,QAAQ,EAAE,GAAG,EAAE,CAAC,CAAC,CAAA;QACvC,CAAC,IAAI,QAAQ,CAAA;QACb,GAAG,CAAC,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAE,CAAA;QACrC,IAAI,CAAC,QAAQ,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,CAAA;QACpF,CAAC,IAAI,CAAC,CAAA;QACN,IAAI,CAAC,QAAQ,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,CAAA;QACpF,CAAC,IAAI,CAAC,CAAA;IACP,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;GAGG;AACH,MAAM,OAAO,sBAAsB;IACzB,IAAI,CAAY;IAChB,KAAK,CAAU;IACf,MAAM,CAAQ;IACd,UAAU,CAAU;IACpB,SAAS,CAAQ;IACjB,QAAQ,CAAQ;IAChB,QAAQ,CAAQ;IAEzB,YAAY,KAAiB;QAC5B,IAAI,CAAC,IAAI,GAAG,KAAK,CAAA;QACjB,IAAI,CAAC,KAAK,GAAG,IAAI,QAAQ,CAAC,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,UAAU,EAAE,KAAK,CAAC,UAAU,CAAC,CAAA;QAC3E,IAAI,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,KAAK;YAAE,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAA;QAC1F,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,CAAA;QAC3C,IAAI,CAAC,GAAG,CAAC,CAAA;QACT,MAAM,YAAY,GAAG,KAAK,CAAC,CAAC,EAAE,CAAE,CAAA;QAChC,IAAI,CAAC,UAAU,GAAG,EAAE,CAAA;QACpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,CAAE,EAAE,KAAK,CAAC,CAAC,GAAG,CAAC,CAAE,CAAC,CAAC,CAAA;YACnE,CAAC,IAAI,CAAC,CAAA;QACP,CAAC;QACD,IAAI,CAAC,SAAS,GAAG,KAAK,CAAC,CAAC,EAAE,CAAE,CAAA;QAC5B,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,SAAS,GAAG,QAAQ,CAAA;QACzC,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAA;IAClB,CAAC;IAED,2EAA2E;IAC
3E,OAAO,CAAC,CAAS,EAAE,GAAe;QACjC,MAAM,IAAI,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,GAAG,IAAI,CAAC,QAAQ,CAAA;QAC9C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACzC,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,CAAE,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;YACxC,IAAI,CAAC,KAAK,CAAC;gBAAE,OAAO,CAAC,CAAA;QACtB,CAAC;QACD,OAAO,CAAC,CAAA;IACT,CAAC;IAED,MAAM,CAAC,QAAgB;QACtB,IAAI,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC,SAAS;YAAE,OAAO,EAAE,CAAA,CAAC,0CAA0C;QAC1F,MAAM,GAAG,GAAG,IAAI,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;QAC1C,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,EAAE,GAAG,EAAE,CAAC,CAAC,CAAA;QAE3C,6DAA6D;QAC7D,IAAI,EAAE,GAAG,CAAC,CAAA;QACV,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAA;QACpB,OAAO,EAAE,GAAG,EAAE,EAAE,CAAC;YAChB,MAAM,GAAG,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,KAAK,CAAC,CAAA;YAC3B,IAAI,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,GAAG,CAAC,GAAG,CAAC;gBAAE,EAAE,GAAG,GAAG,GAAG,CAAC,CAAA;;gBACvC,EAAE,GAAG,GAAG,CAAA;QACd,CAAC;QAED,8DAA8D;QAC9D,MAAM,GAAG,GAAoB,EAAE,CAAA;QAC/B,KAAK,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,OAAO,CAAC,CAAC,EAAE,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YACrE,MAAM,IAAI,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,GAAG,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAA;YAC/D,GAAG,CAAC,IAAI,CAAC;gBACR,OAAO,EAAE,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAE,CAAE;gBAC3C,GAAG,EAAE,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,EAAE,IAAI,CAAC,GAAG,KAAK;gBAChD,GAAG,EAAE,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,EAAE,IAAI,CAAC,GAAG,KAAK;aAChD,CAAC,CAAA;QACH,CAAC;QACD,OAAO,GAAG,CAAA;IACX,CAAC;IAED;;;;;;;;;OASG;IACH,cAAc;QACb,MAAM,GAAG,GAAiB,IAAI,GAAG,EAAE,CAAA;QACnC,IAAI,CAAC,GAAG,CAAC,CAAA;QACT,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;YACxB,gEAAgE;YAChE,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,GAAG,IAAI,CAAC,QAAQ,CAAA;YACjD,IAAI,QAAQ,GAAG,EAAE,CAAA;YACjB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACzC,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,GAAG,CAAC,CAAE,CAAA;gBACjC,IAAI,CAAC,KAAK,CAAC;oBAAE,MAAK;gBAClB,QAAQ,IAAI,MAAM,CAAC,YAAY,CAA
C,CAAC,CAAC,CAAA;YACnC,CAAC;YACD,gFAAgF;YAChF,MAAM,SAAS,GAA2B,EAAE,CAAA;YAC5C,IAAI,MAAM,GAAG,CAAC,CAAA;YACd,IAAI,MAAM,GAAG,CAAC,CAAA;YACd,IAAI,aAAa,GAAG,CAAC,CAAA;YACrB,IAAI,CAAC,GAAG,CAAC,CAAA;YACT,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC7B,MAAM,IAAI,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,GAAG,IAAI,CAAC,QAAQ,CAAA;gBAC9C,IAAI,IAAI,GAAG,IAAI,CAAA;gBACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;oBACzC,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,CAAC,KAAK,IAAI,CAAC,IAAI,CAAC,OAAO,GAAG,CAAC,CAAC,EAAE,CAAC;wBACpD,IAAI,GAAG,KAAK,CAAA;wBACZ,MAAK;oBACN,CAAC;gBACF,CAAC;gBACD,IAAI,CAAC,IAAI;oBAAE,MAAK;gBAChB,MAAM,IAAI,GAAG,IAAI,GAAG,IAAI,CAAC,SAAS,CAAA;gBAClC,SAAS,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAE,CAAE,CAAC,GAAG,CAAC,CAAA,CAAC,6CAA6C;gBAC/F,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,EAAE,IAAI,CAAC,GAAG,KAAK,CAAA;gBACvD,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,EAAE,IAAI,CAAC,GAAG,KAAK,CAAA;gBACvD,IAAI,GAAG,KAAK,CAAC,IAAI,GAAG,KAAK,CAAC,EAAE,CAAC;oBAC5B,MAAM,IAAI,GAAG,CAAA;oBACb,MAAM,IAAI,GAAG,CAAA;oBACb,aAAa,EAAE,CAAA;gBAChB,CAAC;YACF,CAAC;YACD,GAAG,CAAC,GAAG,CAAC,QAAQ,EAAE;gBACjB,SAAS;gBACT,GAAG,EAAE,aAAa,CAAC,CAAC,CAAC,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;gBAC/C,GAAG,EAAE,aAAa,CAAC,CAAC,CAAC,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;aAC/C,CAAC,CAAA;YACF,CAAC,GAAG,CAAC,CAAA;QACN,CAAC;QACD,OAAO,GAAG,CAAA;IACX,CAAC;CACD"}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Postcode regex repair pass — v0.7 task #35 ("postcode regex pre-pass").
|
|
7
|
+
*
|
|
8
|
+
* The 2026-05-29 postcode diagnostic showed the neural model fragments alphanumeric postcodes at
|
|
9
|
+
* the SentencePiece layer (GB/CA/NL at 0%, US 80.5%, FR 70.1%). Three failure modes were visible
|
|
10
|
+
* in the data:
|
|
11
|
+
*
|
|
12
|
+
* 1. Total miss — "London SW1A 1AA" → (no postcode label)
|
|
13
|
+
* 2. Truncation — "M5V 2T6" → "2T6"; "B12 8QX" → "B12"
|
|
14
|
+
* 3. Char-drift — "75008" → "5008"; "62701" → "2701" (and smear: "1200-030 Lisboa" → "200-030 Lis")
|
|
15
|
+
*
|
|
16
|
+
* This pass runs AFTER the model's per-token BIO labels are decoded but BEFORE `buildAddressTree`.
|
|
17
|
+
* It detects postcode-shaped substrings with per-country regexes and repairs the label sequence
|
|
18
|
+
* so the postcode span matches the detected shape. The model is untouched — this is a
|
|
19
|
+
* deterministic decoder-side correction, the "lowest risk" lever in the v0.7 plan (vs. #36's soft
|
|
20
|
+
* FST shallow-fusion or #41's char-level encoder).
|
|
21
|
+
*
|
|
22
|
+
* PRECISION GUARDS (so we never regress the countries already passing):
|
|
23
|
+
*
|
|
24
|
+
* - Alphanumeric shapes (GB/CA/NL/DE-prefixed) are high-confidence "this IS a postcode" patterns →
|
|
25
|
+
* eligible to ADD a span where the model emitted none, but only over non-structural labels
|
|
26
|
+
* (never over house_number/street/etc.).
|
|
27
|
+
* - Numeric shapes (\d{5}, ZIP+4, JP, PT, PL) are ambiguous (a bare 5-digit could be a house number)
|
|
28
|
+
* → SNAP-only: they expand/clip an EXISTING postcode span, never create one from scratch.
|
|
29
|
+
* - Smear cleanup is LOCAL: only postcode tokens immediately flanking a snapped span are cleared. We
|
|
30
|
+
* never globally clear unmatched postcode tokens — that would regress shapes we don't
|
|
31
|
+
* pattern-match (AU 4-digit, IN 6-digit, …).
|
|
32
|
+
*/
|
|
33
|
+
import type { DecoderToken } from "@mailwoman/core/decoder";
|
|
34
|
+
/** A detected postcode-shaped substring with its char range and confidence class. */
|
|
35
|
+
export interface PostcodeMatch {
|
|
36
|
+
/** Char offset in the input text where the match begins (inclusive). */ start: number;
|
|
37
|
+
/** Char offset just past the match (exclusive): `start` + matched length. */ end: number;
|
|
38
|
+
/** "alnum" shapes may ADD; "numeric" shapes may only SNAP an existing span. */
|
|
39
|
+
kind: "alnum" | "numeric";
|
|
40
|
+
/** Index of the matching pattern in POSTCODE_PATTERNS (lower = more specific); breaks ties between equal-length overlapping matches. */
|
|
41
|
+
priority: number;
|
|
42
|
+
}
|
|
43
|
+
/**
|
|
44
|
+
* Per-country postcode shape patterns, ordered most-specific → least. Alphanumeric patterns require
|
|
45
|
+
* uppercase letters (postcodes are conventionally uppercase, and the eval data has them uppercase)
|
|
46
|
+
* — this keeps them from matching ordinary lowercase prose.
|
|
47
|
+
*/
|
|
48
|
+
export declare const POSTCODE_PATTERNS: Array<{
|
|
49
|
+
label: string;
|
|
50
|
+
kind: "alnum" | "numeric";
|
|
51
|
+
re: RegExp;
|
|
52
|
+
}>;
|
|
53
|
+
/** Collect non-overlapping postcode matches: the longest match wins an overlap; ties go to the more-specific (earlier) pattern. */
|
|
54
|
+
export declare function collectMatches(text: string): PostcodeMatch[];
|
|
55
|
+
export interface RepairResult {
|
|
56
|
+
/** Repaired copy of the input tokens (the input array and its tokens are not mutated). */ tokens: DecoderToken[];
|
|
57
|
+
/** Number of token labels changed — for telemetry / logging. */
|
|
58
|
+
changed: number;
|
|
59
|
+
}
|
|
60
|
+
/**
|
|
61
|
+
* Repair postcode label spans in a decoded token sequence using per-country regexes. Returns a NEW
|
|
62
|
+
* token array (inputs are not mutated) plus a change count.
|
|
63
|
+
*/
|
|
64
|
+
export declare function repairPostcodeLabels(text: string, input: readonly DecoderToken[]): RepairResult;
|
|
65
|
+
//# sourceMappingURL=postcode-repair.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"postcode-repair.d.ts","sourceRoot":"","sources":["../postcode-repair.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAA;AAE3D,qFAAqF;AACrF,MAAM,WAAW,aAAa;IAC7B,KAAK,EAAE,MAAM,CAAA;IACb,GAAG,EAAE,MAAM,CAAA;IACX,+EAA+E;IAC/E,IAAI,EAAE,OAAO,GAAG,SAAS,CAAA;IACzB,yEAAyE;IACzE,QAAQ,EAAE,MAAM,CAAA;CAChB;AAED;;;;GAIG;AACH,eAAO,MAAM,iBAAiB,EAAE,KAAK,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,OAAO,GAAG,SAAS,CAAC;IAAC,EAAE,EAAE,MAAM,CAAA;CAAE,CAiB7F,CAAA;AA0BD,6FAA6F;AAC7F,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,aAAa,EAAE,CAkB5D;AAED,MAAM,WAAW,YAAY;IAC5B,MAAM,EAAE,YAAY,EAAE,CAAA;IACtB,gEAAgE;IAChE,OAAO,EAAE,MAAM,CAAA;CACf;AAED;;;GAGG;AACH,wBAAgB,oBAAoB,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,YAAY,EAAE,GAAG,YAAY,CAmE/F"}
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Postcode regex repair pass — v0.7 task #35 ("postcode regex pre-pass").
|
|
7
|
+
*
|
|
8
|
+
* The 2026-05-29 postcode diagnostic showed the neural model fragments alphanumeric postcodes at
|
|
9
|
+
* the SentencePiece layer (GB/CA/NL at 0%, US 80.5%, FR 70.1%). Three failure modes were visible
|
|
10
|
+
* in the data:
|
|
11
|
+
*
|
|
12
|
+
* 1. Total miss — "London SW1A 1AA" → (no postcode label)
|
|
13
|
+
* 2. Truncation — "M5V 2T6" → "2T6"; "B12 8QX" → "B12"
|
|
14
|
+
* 3. Char-drift — "75008" → "5008"; "62701" → "2701" (and smear: "1200-030 Lisboa" → "200-030 Lis")
|
|
15
|
+
*
|
|
16
|
+
* This pass runs AFTER the model's per-token BIO labels are decoded but BEFORE `buildAddressTree`.
|
|
17
|
+
* It detects postcode-shaped substrings with per-country regexes and repairs the label sequence
|
|
18
|
+
* so the postcode span matches the detected shape. The model is untouched — this is a
|
|
19
|
+
* deterministic decoder-side correction, the "lowest risk" lever in the v0.7 plan (vs. #36's soft
|
|
20
|
+
* FST shallow-fusion or #41's char-level encoder).
|
|
21
|
+
*
|
|
22
|
+
* PRECISION GUARDS (so we never regress the countries already passing):
|
|
23
|
+
*
|
|
24
|
+
* - Alphanumeric shapes (GB/CA/NL/DE-prefixed) are high-confidence "this IS a postcode" patterns →
|
|
25
|
+
* eligible to ADD a span where the model emitted none, but only over non-structural labels
|
|
26
|
+
* (never over house_number/street/etc.).
|
|
27
|
+
* - Numeric shapes (\d{5}, ZIP+4, JP, PT, PL) are ambiguous (a bare 5-digit could be a house number)
|
|
28
|
+
* → SNAP-only: they expand/clip an EXISTING postcode span, never create one from scratch.
|
|
29
|
+
* - Smear cleanup is LOCAL: only postcode tokens immediately flanking a snapped span are cleared. We
|
|
30
|
+
* never globally clear unmatched postcode tokens — that would regress shapes we don't
|
|
31
|
+
* pattern-match (AU 4-digit, IN 6-digit, …).
|
|
32
|
+
*/
|
|
33
|
+
/**
 * Postcode shape patterns, one per country/shape, listed most-specific → least; a pattern's
 * index in this array is its priority. Each entry is `{ label, kind, re }` with a global regex.
 *
 * The alphanumeric patterns only accept uppercase letters (postcodes are conventionally
 * uppercase, and the eval data has them uppercase), which keeps them from firing on ordinary
 * lowercase prose.
 */
export const POSTCODE_PATTERNS = (() => {
    /** High-confidence alphanumeric shape — eligible to ADD a postcode span. */
    const alnum = (label, re) => ({ label, kind: "alnum", re });
    /** Ambiguous digits-only shape — SNAP-only, never creates a span. */
    const numeric = (label, re) => ({ label, kind: "numeric", re });
    return [
        // GB: outward + space + inward, e.g. SW1A 1AA, EH8 9YL, W1J 9PN, IP13 6SU, B12 8QX
        alnum("GB", /\b[A-Z]{1,2}\d[A-Z\d]?\s+\d[A-Z]{2}\b/g),
        // CA: A1A 1A1 (space optional), e.g. M5V 2T6, H2X 2T6, H3B 1A3
        alnum("CA", /\b[A-Z]\d[A-Z]\s?\d[A-Z]\d\b/g),
        // DE-prefixed: D-68161
        alnum("DE", /\bD-\d{5}\b/g),
        // NL: 1234 AB / 1234AB — space optional (glued is common). It collides with the US
        // "2737 CA" (ZIP+4 tail + state); longest-match-wins in collectMatches lets the ZIP+4
        // claim that text instead.
        alnum("NL", /\b\d{4}\s?[A-Z]{2}\b/g),
        numeric("ZIP4", /\b\d{5}-\d{4}\b/g), // US ZIP+4
        numeric("JP", /\b\d{3}-\d{4}\b/g), // 100-0001
        numeric("PT", /\b\d{4}-\d{3}\b/g), // 3060-187
        numeric("PL", /\b\d{2}-\d{3}\b/g), // 47-400
        numeric("NUM5", /\b\d{5}\b/g), // US/FR/DE/ES 5-digit
    ];
})();
|
|
56
|
+
/**
|
|
57
|
+
* Labels a postcode span is allowed to overwrite when the model emitted no postcode at all (ADD
|
|
58
|
+
* path). These are the geographic-container tags postcodes get confused with per the diagnostic
|
|
59
|
+
* ("often labeled as locality or O"). Structural tags (house_number, street*, unit, po_box, venue,
|
|
60
|
+
* …) are intentionally absent so we never clobber a confidently-labeled street/number with a false
|
|
61
|
+
* postcode.
|
|
62
|
+
*/
|
|
63
|
+
const ADD_OVER_TAGS = new Set(["locality", "dependent_locality", "region", "subregion", "country"]);
|
|
64
|
+
const POSTCODE_B = "B-postcode"; // BIO label given to the first token of a postcode span
|
|
65
|
+
const POSTCODE_I = "I-postcode"; // BIO label given to every following token of the same postcode span
|
|
66
|
+
const LOCALITY_B = "B-locality"; // BIO label opening a locality span — presumably used by the trailing-smear hand-back; usage not visible in this chunk
|
|
67
|
+
const LOCALITY_I = "I-locality"; // BIO label continuing a locality span — same caveat as LOCALITY_B
|
|
68
|
+
const OUTSIDE = "O"; // BIO "outside" label: the token belongs to no span
|
|
69
|
+
/** True when `label` is one of the two BIO postcode labels ("B-postcode" or "I-postcode"). */
function isPostcodeLabel(label) {
    switch (label) {
        case "B-postcode":
        case "I-postcode":
            return true;
        default:
            return false;
    }
}
|
|
72
|
+
/**
 * Strip the two-character BIO prefix from a label to get the bare tag
 * ("B-locality" → "locality"); the outside label "O" has no tag and yields null.
 */
function tagOf(label) {
    if (label === "O") {
        return null;
    }
    return label.slice(2);
}
|
|
76
|
+
/** Collect non-overlapping postcode matches, preferring more-specific (earlier) patterns. */
|
|
77
|
+
export function collectMatches(text) {
|
|
78
|
+
const candidates = [];
|
|
79
|
+
POSTCODE_PATTERNS.forEach((pat, priority) => {
|
|
80
|
+
pat.re.lastIndex = 0;
|
|
81
|
+
for (let m = pat.re.exec(text); m; m = pat.re.exec(text)) {
|
|
82
|
+
candidates.push({ start: m.index, end: m.index + m[0].length, kind: pat.kind, priority });
|
|
83
|
+
}
|
|
84
|
+
});
|
|
85
|
+
// Greedy longest-match-wins: accept by (length desc, then priority asc); reject anything
|
|
86
|
+
// overlapping an accepted match. Longest-first lets a US ZIP+4 ("94610-2737") claim its span
|
|
87
|
+
// before the shorter NL-shaped false positive in its tail ("2737 CA") can.
|
|
88
|
+
candidates.sort((a, b) => b.end - b.start - (a.end - a.start) || a.priority - b.priority);
|
|
89
|
+
const accepted = [];
|
|
90
|
+
for (const c of candidates) {
|
|
91
|
+
if (accepted.some((a) => c.start < a.end && a.start < c.end))
|
|
92
|
+
continue;
|
|
93
|
+
accepted.push(c);
|
|
94
|
+
}
|
|
95
|
+
return accepted;
|
|
96
|
+
}
|
|
97
|
+
/**
 * Reconcile postcode labels in a decoded token sequence with the postcode-shaped spans that
 * `collectMatches` finds in the raw text.
 *
 * For each matched span that overlaps at least one token:
 * - SNAP: if any overlapping token already carries a postcode label, the whole overlapping run is
 *   relabelled as one postcode span.
 * - ADD: if none does, the run is relabelled only when the match kind is "alnum" and every
 *   overlapping token's tag is `null` or listed in `ADD_OVER_TAGS`.
 * - Postcode-labelled tokens directly before the run are reset to `OUTSIDE`.
 * - Postcode-labelled tokens directly after the run are handed to the locality that follows them,
 *   or reset to `OUTSIDE` when no locality follows.
 *
 * @param text Raw string handed to `collectMatches`.
 * @param input Decoded tokens; `start`, `end` and `label` are read. Each token is shallow-copied,
 *   so the caller's objects are never written to.
 * @returns `{ tokens, changed }`: the copied tokens with repaired labels, and how many label
 *   assignments actually replaced a different value.
 */
export function repairPostcodeLabels(text, input) {
    const spans = collectMatches(text);
    const tokens = input.map((original) => {
        return { ...original };
    });
    let changed = 0;
    if (spans.length === 0) {
        return { tokens, changed };
    }

    // Write a label only when it differs, so `changed` counts real edits.
    const relabel = (index, next) => {
        const token = tokens[index];
        if (token.label === next) {
            return;
        }
        token.label = next;
        changed += 1;
    };
    const carriesPostcode = (index) => isPostcodeLabel(tokens[index].label);

    for (const span of spans) {
        // Indices of every token whose character range intersects the regex match.
        const run = [];
        for (const [index, token] of tokens.entries()) {
            if (token.start < span.end && span.start < token.end) {
                run.push(index);
            }
        }
        if (run.length === 0) {
            continue;
        }

        if (!run.some((index) => carriesPostcode(index))) {
            // ADD path: the model saw no postcode here. Trust the regex only for the
            // high-confidence alphanumeric shapes, and only over labels that are safe to replace.
            if (span.kind !== "alnum") {
                continue;
            }
            const blocked = run.some((index) => {
                const tag = tagOf(tokens[index].label);
                return tag !== null && !ADD_OVER_TAGS.has(tag);
            });
            if (blocked) {
                continue;
            }
        }

        // SNAP/ADD: the matched run becomes exactly one postcode span (B followed by I...).
        for (let k = 0; k < run.length; k++) {
            relabel(run[k], k === 0 ? POSTCODE_B : POSTCODE_I);
        }

        // Leading smear: postcode labels hanging off the LEFT edge of the run (e.g. a house-number
        // digit the model over-labelled) are noise and are cleared.
        let before = run[0] - 1;
        while (before >= 0 && carriesPostcode(before)) {
            relabel(before, OUTSIDE);
            before -= 1;
        }

        // Trailing smear: postcode labels hanging off the RIGHT edge of the run.
        const tail = [];
        let after = run[run.length - 1] + 1;
        while (after < tokens.length && carriesPostcode(after)) {
            tail.push(after);
            after += 1;
        }
        if (tail.length === 0) {
            continue;
        }

        // `after` now indexes the first token past the smear. In postcode-before-city locales
        // ("08523 Plauen") the smear is the start of the city name, so when a locality follows,
        // the smeared tokens are returned to it rather than discarded.
        const feedsLocality = after < tokens.length && tagOf(tokens[after].label) === "locality";
        if (!feedsLocality) {
            // Standalone neighbour with no locality behind it ("Paris 75008 France"): clear to O.
            for (const index of tail) {
                relabel(index, OUTSIDE);
            }
            continue;
        }
        for (let k = 0; k < tail.length; k++) {
            relabel(tail[k], k === 0 ? LOCALITY_B : LOCALITY_I);
        }
        // Demote the city's own leading B so the recovered prefix and the city form ONE span.
        if (tokens[after].label === "B-locality") {
            relabel(after, LOCALITY_I);
        }
    }

    return { tokens, changed };
}
|
|
171
|
+
//# sourceMappingURL=postcode-repair.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"postcode-repair.js","sourceRoot":"","sources":["../postcode-repair.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AAcH;;;;GAIG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAoE;IACjG,yCAAyC;IACzC,mFAAmF;IACnF,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,wCAAwC,EAAE;IAC5E,+DAA+D;IAC/D,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,+BAA+B,EAAE;IACnE,uBAAuB;IACvB,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,cAAc,EAAE;IAClD,0FAA0F;IAC1F,2FAA2F;IAC3F,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,uBAAuB,EAAE;IAC3D,8BAA8B;IAC9B,EAAE,KAAK,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,EAAE,EAAE,kBAAkB,EAAE,EAAE,WAAW;IACvE,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,EAAE,EAAE,kBAAkB,EAAE,EAAE,WAAW;IACrE,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,EAAE,EAAE,kBAAkB,EAAE,EAAE,WAAW;IACrE,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,EAAE,EAAE,kBAAkB,EAAE,EAAE,SAAS;IACnE,EAAE,KAAK,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,EAAE,sBAAsB;CAC5E,CAAA;AAED;;;;;;GAMG;AACH,MAAM,aAAa,GAAG,IAAI,GAAG,CAAS,CAAC,UAAU,EAAE,oBAAoB,EAAE,QAAQ,EAAE,WAAW,EAAE,SAAS,CAAC,CAAC,CAAA;AAE3G,MAAM,UAAU,GAAG,YAAqC,CAAA;AACxD,MAAM,UAAU,GAAG,YAAqC,CAAA;AACxD,MAAM,UAAU,GAAG,YAAqC,CAAA;AACxD,MAAM,UAAU,GAAG,YAAqC,CAAA;AACxD,MAAM,OAAO,GAAG,GAA4B,CAAA;AAE5C,SAAS,eAAe,CAAC,KAAa;IACrC,OAAO,KAAK,KAAK,YAAY,IAAI,KAAK,KAAK,YAAY,CAAA;AACxD,CAAC;AAED,qFAAqF;AACrF,SAAS,KAAK,CAAC,KAAa;IAC3B,OAAO,KAAK,KAAK,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;AAC7C,CAAC;AAED,6FAA6F;AAC7F,MAAM,UAAU,cAAc,CAAC,IAAY;IAC1C,MAAM,UAAU,GAAoB,EAAE,CAAA;IACtC,iBAAiB,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,QAAQ,EAAE,EAAE;QAC3C,GAAG,CAAC,EAAE,CAAC,SAAS,GAAG,CAAC,CAAA;QACpB,KAAK,IAAI,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YAC1D,UAAU,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,IAAI,EAAE,GAAG,CAAC,IAAI,EAAE,QAAQ
,EAAE,CAAC,CAAA;QAC1F,CAAC;IACF,CAAC,CAAC,CAAA;IACF,yFAAyF;IACzF,6FAA6F;IAC7F,2EAA2E;IAC3E,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAA;IACzF,MAAM,QAAQ,GAAoB,EAAE,CAAA;IACpC,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;QAC5B,IAAI,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAC;YAAE,SAAQ;QACtE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;IACD,OAAO,QAAQ,CAAA;AAChB,CAAC;AAQD;;;GAGG;AACH,MAAM,UAAU,oBAAoB,CAAC,IAAY,EAAE,KAA8B;IAChF,MAAM,OAAO,GAAG,cAAc,CAAC,IAAI,CAAC,CAAA;IACpC,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC,CAAA;IAC3C,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAC,EAAE,CAAA;IAEvD,IAAI,OAAO,GAAG,CAAC,CAAA;IACf,MAAM,QAAQ,GAAG,CAAC,CAAS,EAAE,KAA4B,EAAQ,EAAE;QAClE,IAAI,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,KAAK,KAAK,EAAE,CAAC;YAChC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,GAAG,KAAK,CAAA;YACxB,OAAO,EAAE,CAAA;QACV,CAAC;IACF,CAAC,CAAA;IAED,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACzB,+CAA+C;QAC/C,MAAM,OAAO,GAAa,EAAE,CAAA;QAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACpB,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG;gBAAE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACxD,CAAC;QACD,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;YAAE,SAAQ;QAElC,MAAM,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,CAAC,CAAA;QAC1E,IAAI,CAAC,WAAW,EAAE,CAAC;YAClB,kFAAkF;YAClF,IAAI,CAAC,CAAC,IAAI,KAAK,OAAO;gBAAE,SAAQ;YAChC,MAAM,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE;gBAChC,MAAM,GAAG,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,CAAA;gBACnC,OAAO,GAAG,KAAK,IAAI,IAAI,aAAa,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;YAC9C,CAAC,CAAC,CAAA;YACF,IAAI,CAAC,IAAI;gBAAE,SAAQ;QACpB,CAAC;QAED,+DAA+
D;QAC/D,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAA;QAEzE,2FAA2F;QAC3F,qEAAqE;QACrE,KAAK,IAAI,CAAC,GAAG,OAAO,CAAC,CAAC,CAAE,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,eAAe,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,EAAE,CAAC,EAAE;YAAE,QAAQ,CAAC,CAAC,EAAE,OAAO,CAAC,CAAA;QAEpG,6FAA6F;QAC7F,gGAAgG;QAChG,0FAA0F;QAC1F,kGAAkG;QAClG,2FAA2F;QAC3F,yFAAyF;QACzF,8FAA8F;QAC9F,wFAAwF;QACxF,4FAA4F;QAC5F,4DAA4D;QAC5D,MAAM,QAAQ,GAAa,EAAE,CAAA;QAC7B,KAAK,IAAI,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAE,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,IAAI,eAAe,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5G,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACjB,CAAC;QACD,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzB,MAAM,KAAK,GAAG,QAAQ,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAE,GAAG,CAAC,CAAA;YAChD,MAAM,cAAc,GAAG,KAAK,GAAG,MAAM,CAAC,MAAM,IAAI,KAAK,CAAC,MAAM,CAAC,KAAK,CAAE,CAAC,KAAK,CAAC,KAAK,UAAU,CAAA;YAC1F,IAAI,cAAc,EAAE,CAAC;gBACpB,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAA;gBAC1E,IAAI,MAAM,CAAC,KAAK,CAAE,CAAC,KAAK,KAAK,YAAY;oBAAE,QAAQ,CAAC,KAAK,EAAE,UAAU,CAAC,CAAA;YACvE,CAAC;iBAAM,CAAC;gBACP,KAAK,MAAM,CAAC,IAAI,QAAQ;oBAAE,QAAQ,CAAC,CAAC,EAAE,OAAO,CAAC,CAAA;YAC/C,CAAC;QACF,CAAC;IACF,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAA;AAC3B,CAAC"}
|
|
@@ -24,7 +24,11 @@ export interface NeuralProposalClassifierConfig {
|
|
|
24
24
|
id: string;
|
|
25
25
|
/** The underlying neural classifier instance. */
|
|
26
26
|
classifier: NeuralAddressClassifier;
|
|
27
|
-
/**
|
|
27
|
+
/**
|
|
28
|
+
* Component tags this classifier may emit. Defaults to the Stage 2 tag set (coarse +
|
|
29
|
+
* venue/street/house_number). v0.2.0 Stage 1 models never decode to the fine tags anyway, so the
|
|
30
|
+
* broader default is forwards-compat without back-compat risk.
|
|
31
|
+
*/
|
|
28
32
|
emits?: readonly ComponentTag[];
|
|
29
33
|
/** Locales this classifier is active for. `["*"]` (locale-agnostic) by default. */
|
|
30
34
|
locales?: readonly (string | "*")[];
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"proposal-classifier.d.ts","sourceRoot":"","sources":["../proposal-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAIH,OAAO,KAAK,EAGX,YAAY,EACZ,kBAAkB,EAElB,MAAM,uBAAuB,CAAA;AAC9B,OAAO,KAAK,EAAE,uBAAuB,EAAE,MAAM,iBAAiB,CAAA;AAG9D,MAAM,WAAW,8BAA8B;IAC9C,wFAAwF;IACxF,EAAE,EAAE,MAAM,CAAA;IACV,iDAAiD;IACjD,UAAU,EAAE,uBAAuB,CAAA;IACnC
|
|
1
|
+
{"version":3,"file":"proposal-classifier.d.ts","sourceRoot":"","sources":["../proposal-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAIH,OAAO,KAAK,EAGX,YAAY,EACZ,kBAAkB,EAElB,MAAM,uBAAuB,CAAA;AAC9B,OAAO,KAAK,EAAE,uBAAuB,EAAE,MAAM,iBAAiB,CAAA;AAG9D,MAAM,WAAW,8BAA8B;IAC9C,wFAAwF;IACxF,EAAE,EAAE,MAAM,CAAA;IACV,iDAAiD;IACjD,UAAU,EAAE,uBAAuB,CAAA;IACnC;;;;OAIG;IACH,KAAK,CAAC,EAAE,SAAS,YAAY,EAAE,CAAA;IAC/B,mFAAmF;IACnF,OAAO,CAAC,EAAE,SAAS,CAAC,MAAM,GAAG,GAAG,CAAC,EAAE,CAAA;IACnC,+DAA+D;IAC/D,OAAO,CAAC,EAAE,MAAM,CAAA;CAChB;AAED,0EAA0E;AAC1E,wBAAgB,8BAA8B,CAAC,GAAG,EAAE,8BAA8B,GAAG,kBAAkB,CA+CtG"}
|
|
@@ -17,14 +17,16 @@
|
|
|
17
17
|
* inference is a future optimization once the policy layer has a way to invoke a classifier "once
|
|
18
18
|
* per parse" instead of per section.
|
|
19
19
|
*/
|
|
20
|
-
import {
|
|
20
|
+
import { STAGE2_TAGS } from "./labels.js";
|
|
21
21
|
/** Build a `ProposalClassifier` backed by a `NeuralAddressClassifier`. */
|
|
22
22
|
export function createNeuralProposalClassifier(cfg) {
|
|
23
|
-
const emits = cfg.emits ??
|
|
23
|
+
const emits = cfg.emits ?? STAGE2_TAGS;
|
|
24
24
|
const emitsSet = new Set(emits);
|
|
25
25
|
const penalty = cfg.penalty ?? 0;
|
|
26
26
|
async function classify(section, _ctx) {
|
|
27
|
-
|
|
27
|
+
// Postcode regex repair on by default (v0.7 #35, operator-signed): +135/0 on the postcode
|
|
28
|
+
// harness, model-independent. Fixes the SentencePiece-fragmentation misses (GB/CA/NL/…).
|
|
29
|
+
const tree = await cfg.classifier.parse(section.body, { postcodeRepair: true });
|
|
28
30
|
const proposals = [];
|
|
29
31
|
const sectionOffset = section.start;
|
|
30
32
|
const visit = (node) => {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"proposal-classifier.js","sourceRoot":"","sources":["../proposal-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAYH,OAAO,EAAE,
|
|
1
|
+
{"version":3,"file":"proposal-classifier.js","sourceRoot":"","sources":["../proposal-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAYH,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAA;AAmBzC,0EAA0E;AAC1E,MAAM,UAAU,8BAA8B,CAAC,GAAmC;IACjF,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,IAAI,WAAW,CAAA;IACtC,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAe,KAAgC,CAAC,CAAA;IACxE,MAAM,OAAO,GAAG,GAAG,CAAC,OAAO,IAAI,CAAC,CAAA;IAEhC,KAAK,UAAU,QAAQ,CAAC,OAAgB,EAAE,IAAuB;QAChE,0FAA0F;QAC1F,yFAAyF;QACzF,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,cAAc,EAAE,IAAI,EAAE,CAAC,CAAA;QAC/E,MAAM,SAAS,GAA6B,EAAE,CAAA;QAC9C,MAAM,aAAa,GAAG,OAAO,CAAC,KAAK,CAAA;QAEnC,MAAM,KAAK,GAAG,CAAC,IAAiB,EAAQ,EAAE;YACzC,IAAI,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC5B,wFAAwF;gBACxF,oFAAoF;gBACpF,uFAAuF;gBACvF,iFAAiF;gBACjF,mFAAmF;gBACnF,2EAA2E;gBAC3E,MAAM,IAAI,GAAG;oBACZ,KAAK,EAAE,aAAa,GAAG,IAAI,CAAC,KAAK;oBACjC,GAAG,EAAE,aAAa,GAAG,IAAI,CAAC,GAAG;oBAC7B,IAAI,EAAE,IAAI,CAAC,KAAK;iBACG,CAAA;gBACpB,SAAS,CAAC,IAAI,CAAC;oBACd,IAAI;oBACJ,SAAS,EAAE,IAAI,CAAC,GAAG;oBACnB,UAAU,EAAE,IAAI,CAAC,UAAU;oBAC3B,MAAM,EAAE,QAAQ;oBAChB,SAAS,EAAE,GAAG,CAAC,EAAE;oBACjB,OAAO;iBACP,CAAC,CAAA;YACH,CAAC;YACD,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,QAAQ;gBAAE,KAAK,CAAC,KAAK,CAAC,CAAA;QAChD,CAAC,CAAA;QAED,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK;YAAE,KAAK,CAAC,IAAI,CAAC,CAAA;QAC1C,OAAO,SAAS,CAAA;IACjB,CAAC;IAED,OAAO;QACN,EAAE,EAAE,GAAG,CAAC,EAAE;QACV,KAAK;QACL,OAAO,EAAE,GAAG,CAAC,OAAO,IAAI,CAAC,GAAG,CAAC;QAC7B,QAAQ;KACR,CAAA;AACF,CAAC"}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Soft-prior emission biases derived from `QueryShape`.
|
|
7
|
+
*
|
|
8
|
+
* When the QueryShape sub-system has identified a known-format span (US ZIP, UK postcode, PO box,
|
|
9
|
+
* etc.), this module produces an additive bias matrix that nudges the encoder's per-token
|
|
10
|
+
* emissions toward the matching BIO label. The biases compose with the structural BIO mask in the
|
|
11
|
+
* Viterbi decoder — confident encoder predictions still win, but uncertain ones get pulled toward
|
|
12
|
+
* the format-implied label.
|
|
13
|
+
*
|
|
14
|
+
* Bitter-lesson-safe boundary: we don't override the encoder, just bias it. The encoder remains the
|
|
15
|
+
* authority on context-dependent calls (the "Buffalo Wild Wings, Buffalo, NY" disambiguation);
|
|
16
|
+
* the QueryShape prior helps on the easy cases (a 5-digit token is _probably_ a postcode).
|
|
17
|
+
*
|
|
18
|
+
* Uses structural typing for the QueryShape input so this module has zero dependencies on
|
|
19
|
+
* `@mailwoman/query-shape` — consumers compute the shape with that package, pass it in here.
|
|
20
|
+
*/
|
|
21
|
+
/**
 * Structural stand-in for the `QueryShape` type exported by `@mailwoman/query-shape`. Only the
 * fields this module consumes are declared, so a real `QueryShape` is accepted by shape and no
 * import of that package is needed.
 */
export interface QueryShapeLike {
    /** Known-format hits detected in the query. */
    knownFormats: readonly KnownFormatHitLike[];
    /** Region-abbreviation hits detected in the query; may be omitted. */
    regionAbbreviations?: readonly RegionAbbreviationHitLike[];
}
|
|
29
|
+
/** Structural shape of one entry in `QueryShapeLike.regionAbbreviations`. */
export interface RegionAbbreviationHitLike {
    /** NOTE(review): presumably the character offset where the abbreviation begins — confirm. */
    start: number;
    /** NOTE(review): presumably the matched abbreviation text — confirm. */
    span: string;
}
|
|
33
|
+
/** Structural shape of one entry in `QueryShapeLike.knownFormats`. */
export interface KnownFormatHitLike {
    /** Identifier of the format that matched. */
    format: string;
    /** `start`/`end` bounds of the hit. */
    span: { start: number; end: number };
    /** 0..1; ambiguous patterns (e.g. 5-digit US/FR/DE overlap) score lower. */
    confidence: number;
}
|
|
42
|
+
/**
 * The part of `TokenizedPiece` this module reads: the `start` and `end` bounds of a token's
 * character span.
 */
export interface TokenLike {
    start: number;
    end: number;
}
|
|
47
|
+
/** Optional tuning knobs accepted by `buildEmissionPriors`. */
export interface BuildPriorsOpts {
    /**
     * Upper bound on the format bias, in log-odds units. Default 1.0, i.e. up to ~e^1 ≈ 2.7× odds
     * for the favored label. The bias is scaled by hit confidence, so a 0.6-confidence format hit
     * contributes at most +0.6.
     */
    biasScale?: number;
    /**
     * Size of the locality soft prior, in log-odds units. Default 2.0, i.e. ~e^2 ≈ 7.4× odds for
     * B-locality / I-locality on tokens preceding a detected region abbreviation.
     */
    localityBiasScale?: number;
    /** Raw input text for region-name matching in the locality bias guard. */
    inputText?: string;
}
|
|
61
|
+
/**
 * Produce a `[seqLen][numLabels]` matrix of additive log-bias, meant to be summed with the encoder
 * emissions before Viterbi decoding.
 *
 * Whenever a token's character span overlaps a format hit's span, the entry for the label that
 * format maps to receives `hit.confidence × biasScale`. Entries stay 0 for tokens that overlap no
 * hit and for formats with no label mapping.
 *
 * An empty `shape.knownFormats` yields the all-zeros matrix, so adding the result is harmless.
 *
 * @param shape Format and region-abbreviation hits for the query.
 * @param tokens Character spans of the tokenized input.
 * @param labels Label names.
 * @param opts Optional bias scales and raw input text.
 * @returns The bias matrix.
 */
export declare function buildEmissionPriors(
    shape: QueryShapeLike,
    tokens: readonly TokenLike[],
    labels: readonly string[],
    opts?: BuildPriorsOpts,
): number[][];
|
|
72
|
+
/**
 * Element-wise sum of two matrices of equal shape.
 *
 * @param emissions First matrix.
 * @param priors Second matrix, same shape as `emissions`.
 * @returns A new matrix holding the sums.
 */
export declare function addEmissionMatrix(
    emissions: number[][],
    priors: number[][],
): number[][];
|
|
74
|
+
//# sourceMappingURL=query-shape-prior.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"query-shape-prior.d.ts","sourceRoot":"","sources":["../query-shape-prior.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAEH;;;GAGG;AACH,MAAM,WAAW,cAAc;IAC9B,YAAY,EAAE,aAAa,CAAC,kBAAkB,CAAC,CAAA;IAC/C,mBAAmB,CAAC,EAAE,aAAa,CAAC,yBAAyB,CAAC,CAAA;CAC9D;AAED,MAAM,WAAW,yBAAyB;IACzC,KAAK,EAAE,MAAM,CAAA;IACb,IAAI,EAAE,MAAM,CAAA;CACZ;AAED,MAAM,WAAW,kBAAkB;IAClC,MAAM,EAAE,MAAM,CAAA;IACd,IAAI,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE,CAAA;IACpC,4EAA4E;IAC5E,UAAU,EAAE,MAAM,CAAA;CAClB;AAED,+DAA+D;AAC/D,MAAM,WAAW,SAAS;IACzB,KAAK,EAAE,MAAM,CAAA;IACb,GAAG,EAAE,MAAM,CAAA;CACX;AAiBD,MAAM,WAAW,eAAe;IAC/B;;;OAGG;IACH,SAAS,CAAC,EAAE,MAAM,CAAA;IAElB;;;OAGG;IACH,iBAAiB,CAAC,EAAE,MAAM,CAAA;IAC1B,0EAA0E;IAC1E,SAAS,CAAC,EAAE,MAAM,CAAA;CAClB;AAED;;;;;;;;;GASG;AACH,wBAAgB,mBAAmB,CAClC,KAAK,EAAE,cAAc,EACrB,MAAM,EAAE,aAAa,CAAC,SAAS,CAAC,EAChC,MAAM,EAAE,aAAa,CAAC,MAAM,CAAC,EAC7B,IAAI,GAAE,eAAoB,GACxB,MAAM,EAAE,EAAE,CAoCZ;AA2ID,0EAA0E;AAC1E,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,EAAE,GAAG,MAAM,EAAE,EAAE,CAWvF"}
|