@mailwoman/neural 2.2.0 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/anchor-inference.d.ts +58 -0
- package/out/anchor-inference.d.ts.map +1 -0
- package/out/anchor-inference.js +95 -0
- package/out/anchor-inference.js.map +1 -0
- package/out/browser.d.ts +2 -0
- package/out/browser.d.ts.map +1 -1
- package/out/browser.js +4 -0
- package/out/browser.js.map +1 -1
- package/out/classifier.d.ts +70 -3
- package/out/classifier.d.ts.map +1 -1
- package/out/classifier.js +80 -19
- package/out/classifier.js.map +1 -1
- package/out/fst-prior.d.ts +71 -0
- package/out/fst-prior.d.ts.map +1 -0
- package/out/fst-prior.js +173 -0
- package/out/fst-prior.js.map +1 -0
- package/out/index.d.ts +3 -0
- package/out/index.d.ts.map +1 -1
- package/out/index.js +3 -0
- package/out/index.js.map +1 -1
- package/out/labels.d.ts +3 -0
- package/out/labels.d.ts.map +1 -1
- package/out/labels.js +13 -0
- package/out/labels.js.map +1 -1
- package/out/onnx-runner.d.ts +8 -1
- package/out/onnx-runner.d.ts.map +1 -1
- package/out/onnx-runner.js +31 -1
- package/out/onnx-runner.js.map +1 -1
- package/out/postcode-anchor.d.ts +117 -0
- package/out/postcode-anchor.d.ts.map +1 -0
- package/out/postcode-anchor.js +269 -0
- package/out/postcode-anchor.js.map +1 -0
- package/out/postcode-binary-resolver.d.ts +60 -0
- package/out/postcode-binary-resolver.d.ts.map +1 -0
- package/out/postcode-binary-resolver.js +208 -0
- package/out/postcode-binary-resolver.js.map +1 -0
- package/out/postcode-repair.d.ts +65 -0
- package/out/postcode-repair.d.ts.map +1 -0
- package/out/postcode-repair.js +171 -0
- package/out/postcode-repair.js.map +1 -0
- package/out/proposal-classifier.d.ts.map +1 -1
- package/out/proposal-classifier.js +3 -1
- package/out/proposal-classifier.js.map +1 -1
- package/out/query-shape-prior.d.ts +12 -0
- package/out/query-shape-prior.d.ts.map +1 -1
- package/out/query-shape-prior.js +132 -2
- package/out/query-shape-prior.js.map +1 -1
- package/out/street-morphology-prior.d.ts +56 -0
- package/out/street-morphology-prior.d.ts.map +1 -0
- package/out/street-morphology-prior.js +159 -0
- package/out/street-morphology-prior.js.map +1 -0
- package/out/unit-repair.d.ts +42 -0
- package/out/unit-repair.d.ts.map +1 -0
- package/out/unit-repair.js +142 -0
- package/out/unit-repair.js.map +1 -0
- package/out/vitest.config.d.ts.map +1 -1
- package/out/vitest.config.js +3 -0
- package/out/vitest.config.js.map +1 -1
- package/out/weights.d.ts +27 -3
- package/out/weights.d.ts.map +1 -1
- package/out/weights.js +46 -2
- package/out/weights.js.map +1 -1
- package/package.json +6 -2
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Postcode anchor — the first member of the "anchor-based parsing" family (Direction D, #240). See
|
|
7
|
+
* `docs/articles/plan/2026-06-03-anchor-based-parsing.md`.
|
|
8
|
+
*
|
|
9
|
+
* A postcode is the most information-dense token in an address: a hierarchical geo-encoding that
|
|
10
|
+
* places a query on Earth far more cheaply than the rest of the parse. This module lifts the
|
|
11
|
+
* postcode out of the BIO sequence-labelling problem and treats it as a structured anchor. It
|
|
12
|
+
* runs the same per-country shape regexes the decoder repair pass uses ({@link collectMatches}),
|
|
13
|
+
* resolves each shaped span against a postcode gazetteer, and returns a SOFT signal: a country
|
|
14
|
+
* posterior plus a calibrated confidence. It never decides a postcode's identity on its own — it
|
|
15
|
+
* reports "this string is (or is not) a real postcode, in these countries, near here", and leaves
|
|
16
|
+
* the parser to weigh that against the surrounding tokens.
|
|
17
|
+
*
|
|
18
|
+
* Two design rules carried from the DeepSeek consult
|
|
19
|
+
* (`.agents/skills/deepseek-consult/ds-pc-turn{1,2}-postcode-anchor.txt`):
|
|
20
|
+
*
|
|
21
|
+
* - The country posterior is UNIFORM over the countries a string actually exists in. We never weight
|
|
22
|
+
* by per-country postcode volume, because that skews "75001" toward whichever country owns
|
|
23
|
+
* more 5-digit codes — the exact bias the anchor exists to avoid. Disambiguation is the
|
|
24
|
+
* parser's job, using script, city tokens, and user locale.
|
|
25
|
+
* - Confidence combines gazetteer MEMBERSHIP with country AMBIGUITY. A string that matches a postcode
|
|
26
|
+
* regex but exists in no gazetteer (a bare `27`, or a 5-digit house number that is not a real
|
|
27
|
+
* code) gets confidence 0, so the parser treats it as a house number. A real-but-ambiguous
|
|
28
|
+
* code (`75001` in FR and US) gets moderate confidence. A real, single-country code gets
|
|
29
|
+
* 1.0.
|
|
30
|
+
*/
|
|
31
|
+
import { candidateSystemsForPostcode } from "@mailwoman/codex";
|
|
32
|
+
import { isGermanStreetToken } from "@mailwoman/codex/de";
|
|
33
|
+
import { isFrenchStreetWord } from "@mailwoman/codex/fr";
|
|
34
|
+
import { isStreetSuffixToken, isUsStateAbbreviation } from "@mailwoman/codex/us";
|
|
35
|
+
import { collectMatches } from "./postcode-repair.js";
|
|
36
|
+
/**
 * Entropy cap for the confidence formula: `confidenceFromCountryCount` divides `log2(k)` by
 * `log2(MAX_COUNTRIES)`, so a k-way country split saturates toward 0 confidence at k=10 (and is
 * clamped to 0 beyond that).
 */
const MAX_COUNTRIES = 10;
/**
 * A fuzzy (typo-corrected) match is less certain than an exact one — scale its confidence down.
 * Used as a multiplier on the anchor confidence when the match type is `"fuzzy"`.
 */
const FUZZY_PENALTY = 0.6;
|
|
43
|
+
/**
 * Enumerate the class-aware edit-distance-1 neighbours of a postcode string.
 *
 * Four edit families are generated, in this order: single-character deletions, same-class
 * substitutions (a digit only becomes another digit, an uppercase letter another uppercase
 * letter), same-class insertions, and swaps of adjacent characters. Keeping substitutions and
 * insertions inside the character's own class mirrors how postcodes get mistyped or mis-OCR'd and
 * keeps the candidate set small.
 *
 * @param s The postcode string to perturb (callers pass an already-normalized, uppercase value;
 *   lowercase letters and punctuation have no class and are never substituted/inserted).
 * @returns The distinct variants in first-generated order; the input itself is never included.
 */
export function editDistance1Variants(s) {
    const DIGITS = "0123456789";
    const LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    // Alphabet a character may be replaced by / padded with; empty for anything unclassified.
    const alphabetFor = (ch) => {
        if (/[0-9]/.test(ch))
            return DIGITS;
        if (/[A-Z]/.test(ch))
            return LETTERS;
        return "";
    };
    const length = s.length;
    const seen = new Set();
    // 1. Deletions: drop the character at each position.
    for (let at = 0; at < length; at += 1) {
        seen.add(s.slice(0, at) + s.slice(at + 1));
    }
    // 2. Substitutions: swap each character for every other member of its class.
    for (let at = 0; at < length; at += 1) {
        const current = s[at];
        const head = s.slice(0, at);
        const tail = s.slice(at + 1);
        for (const replacement of alphabetFor(current)) {
            if (replacement === current)
                continue;
            seen.add(head + replacement + tail);
        }
    }
    // 3. Insertions: at each gap, insert from the class of the character to the right, or — at the
    //    very end — of the character to the left.
    for (let gap = 0; gap <= length; gap += 1) {
        const neighbour = s[gap] ?? s[gap - 1] ?? "";
        const head = s.slice(0, gap);
        const tail = s.slice(gap);
        for (const inserted of alphabetFor(neighbour)) {
            seen.add(head + inserted + tail);
        }
    }
    // 4. Transpositions: swap each adjacent pair.
    for (let at = 0; at + 1 < length; at += 1) {
        seen.add(s.slice(0, at) + s[at + 1] + s[at] + s.slice(at + 2));
    }
    // A swap of two equal characters (or similar) can reproduce the input; it is not a variant.
    seen.delete(s);
    return Array.from(seen);
}
|
|
68
|
+
/**
 * Bring a shaped span into the canonical gazetteer key form.
 *
 * The span is trimmed, uppercased, and every run of whitespace is collapsed to one space. Two
 * country-specific rewrites follow:
 *
 * - German courtesy prefix: `D-68161` → `68161` (the shards store the bare five digits).
 * - Dutch codes: `1012 LM` → `1012LM` (the gazetteer stores them without the space).
 *
 * @param raw The postcode text exactly as it appeared in the input.
 * @returns The normalized lookup key.
 */
export function normalizePostcode(raw) {
    const collapsed = raw.trim().toUpperCase().replace(/\s+/g, " ");
    // German courtesy prefix: keep only the five digits.
    const german = /^D-(\d{5})$/.exec(collapsed);
    if (german)
        return german[1];
    // Dutch "NNNN LL": drop the single separating space.
    const dutch = /^(\d{4}) ([A-Z]{2})$/.exec(collapsed);
    if (dutch)
        return dutch[1] + dutch[2];
    return collapsed;
}
|
|
81
|
+
/**
 * Extract the GB outward code from a normalized unit postcode: the text before the first space,
 * provided everything after that space is an inward half shaped `\d[A-Z]{2}` (`SO4 3RX` → `SO4`).
 *
 * The GB gazetteer is aggregated to outward codes (2.7M units is too large + too fine for an
 * anchor), so the extractor retries with the outward code when the full unit misses.
 *
 * @param normalized A postcode already passed through `normalizePostcode`.
 * @returns The outward code, or `null` when the string is not a GB unit postcode — so the
 *   fallback never fires for other countries' formats.
 */
export function gbOutwardCode(normalized) {
    // One or more non-space characters, a single space, then exactly the inward half.
    const unit = /^([^ ]+) \d[A-Z]{2}$/.exec(normalized);
    return unit === null ? null : unit[1];
}
|
|
94
|
+
/**
 * Ambiguity-based confidence for a code that exists in `k` countries:
 * `1 - log2(k)/log2(MAX_COUNTRIES)`, clamped to [0, 1].
 *
 * k=1 → 1.0; k=2 → ~0.70; k≥MAX_COUNTRIES → 0. A non-positive `k` (no gazetteer membership)
 * yields 0.
 *
 * @param k Number of distinct countries the postcode was found in.
 * @returns Confidence in [0, 1].
 */
function confidenceFromCountryCount(k) {
    const clampToUnit = (x) => Math.max(0, Math.min(1, x));
    if (k === 1)
        return 1;
    return k <= 0 ? 0 : clampToUnit(1 - Math.log2(k) / Math.log2(MAX_COUNTRIES));
}
|
|
106
|
+
/**
 * Confidence scale for a digit-only code that shares its segment with a street word. A house number
 * and a 5-digit postcode are the same shape, so membership alone can't separate `12345 Main St`
 * (house number that happens to be a real ZIP elsewhere) from `San Francisco 94105` (postcode). The
 * structural tell is cheap and locale-general: house numbers sit beside the street, postcodes
 * beside the city. We scale rather than zero — the gazetteer still vouches for the shape, so a lone
 * code in a street-only line stays usable; the penalty just lets a real trailing postcode out-rank
 * it.
 *
 * This is a multiplier: `positionFactor` returns it in place of `1`, and the caller multiplies the
 * anchor confidence by that factor.
 */
const HOUSE_NUMBER_PENALTY = 0.2;
|
|
116
|
+
/**
 * Standalone street-type words for the locales without a codex slice yet (ES/IT). US comes from
 * `@mailwoman/codex/us`, German from `@mailwoman/codex/de`, French from `@mailwoman/codex/fr`; the
 * Dutch compound suffixes are still inline below pending a `codex/nl` slice.
 *
 * Entries are lowercase because `looksLikeStreetWord` lowercases the token and strips non-letters
 * before the lookup. Despite the name, the set is only consulted when the system gate contains
 * `es` or `it`, and the Spanish and Italian words share this one set (either tag enables both).
 */
const NON_US_STREET_WORDS = new Set([
    // Spanish
    "calle",
    "avenida",
    "avda",
    "plaza",
    "paseo",
    "camino",
    "carrera",
    "ronda",
    // Italian
    "via",
    "viale",
    "piazza",
    "corso",
    "largo",
    "vicolo",
    "strada",
    "contrada",
]);
/**
 * Dutch compound street suffixes — matched against a token's tail (pending a `codex/nl` slice).
 * `looksLikeStreetWord` requires the token to be strictly longer than the suffix, so a bare
 * `laan` or `plein` on its own does not match.
 */
const NL_STREET_SUFFIXES = ["straat", "laan", "plein", "gracht", "kade", "dijk", "steeg", "dreef"];
|
|
143
|
+
/**
 * Decide whether a token denotes a street, consulting only the vocabularies named in `systems`.
 *
 * Vocabularies, in the order they are tried:
 *
 * - `us` — the USPS Pub-28 suffix table in `@mailwoman/codex/us` (complete, so `Trl`/`Holw`/`Xing`
 *   all match), EXCEPT abbreviations that collide with a state code — `KY` (Key vs Kentucky), `PR`
 *   (Prairie vs Puerto Rico) — which sit in the postcode's own `City, ST ZIP` segment.
 * - `de` — German compounds via {@link isGermanStreetToken} from `@mailwoman/codex/de`, whose
 *   suffix set already excludes the place-name endings (`-berg`, `-burg`, `-dorf`) that would
 *   otherwise flag a city token.
 * - `fr` — French voie words via {@link isFrenchStreetWord} from `@mailwoman/codex/fr`.
 * - `es` / `it` — the inline standalone-word set.
 * - `nl` — the inline Dutch compound suffixes, matched against the token's tail; the token must be
 *   longer than the suffix.
 *
 * `systems` GATES which vocabularies are consulted — only the systems the postcode plausibly
 * belongs to (its gazetteer membership, e.g. a US-only ZIP gates to `{us}` and never checks the
 * German or French vocab). This is what lets the check scale to 15-20 systems without a
 * cross-locale collision (German `-ring` vs English `spring`): an unrelated system's vocabulary is
 * simply never asked. The gate carries lowercase system/locale tags (`us`, `de`, `fr`, `es`, `it`,
 * `nl`).
 *
 * @param token One whitespace-delimited token; it is lowercased and stripped of every non-letter
 *   character before matching, and anything shorter than two letters is rejected.
 * @param systems Set of lowercase system tags to consult.
 * @returns `true` when any gated vocabulary recognises the token as a street word.
 */
function looksLikeStreetWord(token, systems) {
    const word = token.toLowerCase().replace(/[^\p{L}]/gu, "");
    if (word.length < 2)
        return false;
    const gatedToIberoItalian = systems.has("es") || systems.has("it");
    return Boolean((systems.has("us") && isStreetSuffixToken(word) && !isUsStateAbbreviation(word)) ||
        (systems.has("de") && isGermanStreetToken(word)) ||
        (systems.has("fr") && isFrenchStreetWord(word)) ||
        (gatedToIberoItalian && NON_US_STREET_WORDS.has(word)) ||
        (systems.has("nl") &&
            NL_STREET_SUFFIXES.some((suffix) => word.length > suffix.length && word.endsWith(suffix))));
}
|
|
176
|
+
/**
 * Position-aware confidence factor for a postcode span.
 *
 * Returns `1` for anything that cannot be confused with a house number, and
 * {@link HOUSE_NUMBER_PENALTY} for a digit-only code that shares its comma-delimited segment with a
 * street word. This structural prior lets the anchor tell a leading `12345 Main St` house number
 * from a trailing `San Francisco 94105` postcode with no model in the loop — and lets a consumer
 * pick the right span by confidence instead of by raw position.
 *
 * @param text The full input text.
 * @param start Offset of the postcode span within `text`.
 * @param normalized The normalized postcode; only all-digit values are ever penalised.
 * @param systems Street vocabularies to consult — the ones this code plausibly belongs to (its
 *   gazetteer membership, or, for a code in no gazetteer, the format-shape candidates from codex).
 * @returns `1` or {@link HOUSE_NUMBER_PENALTY}.
 */
function positionFactor(text, start, normalized, systems) {
    // Only digit-only codes collide with house numbers.
    const digitsOnly = /^\d+$/.test(normalized);
    if (!digitsOnly)
        return 1;
    // The segment runs from just after the previous comma to just before the next one (or to the
    // ends of the text when there is no comma on that side).
    const previousComma = text.lastIndexOf(",", start - 1);
    const nextComma = text.indexOf(",", start);
    const segment = text.slice(previousComma + 1, nextComma < 0 ? text.length : nextComma);
    const besideStreetWord = segment.split(/\s+/).some((piece) => looksLikeStreetWord(piece, systems));
    return besideStreetWord ? HOUSE_NUMBER_PENALTY : 1;
}
|
|
200
|
+
/**
 * Extract postcode anchors from raw text. For each postcode-shaped span, resolve it against the
 * gazetteer and emit a soft anchor (country posterior + confidence). Spans that match a shape but
 * exist in no gazetteer are still returned, with an empty posterior and confidence 0 — an explicit
 * "looks like a postcode, but isn't one" so the caller can see the extractor fired and chose not to
 * anchor.
 *
 * Resolution ladder per span: exact lookup → GB outward-code fallback → (only when `opts.fuzzy`)
 * edit-distance-1 variants. The first rung that returns any hit wins and is recorded as
 * `matchType` (`"exact"`, `"outward"`, `"fuzzy"`, or `"none"`).
 *
 * @param text Raw input text to scan.
 * @param resolver Gazetteer seam: `lookup(postcode)` returns an array of places, each carrying
 *   `country`, `lat` and `lon`; an empty array means "unknown postcode".
 * @param opts Options. `opts.fuzzy` enables the edit-distance-1 retry for spans that miss both the
 *   exact and the GB outward lookup.
 * @returns One anchor per shaped span, in the order the spans were matched. Each anchor has `span`
 *   (`text`/`start`/`end`), `normalized`, `candidates` (one coordinate-bearing place per country),
 *   `posterior` (uniform over member countries), `confidence`, `matchType`, and `positionFactor`.
 */
export function extractPostcodeAnchors(text, resolver, opts = {}) {
    const anchors = [];
    for (const match of collectMatches(text)) {
        const spanText = text.slice(match.start, match.end);
        const normalized = normalizePostcode(spanText);
        // Exact first; then the GB outward fallback (structural, not a guess); then edit-distance-1.
        let hits = resolver.lookup(normalized);
        let matchType = hits.length > 0 ? "exact" : "none";
        if (matchType === "none") {
            const outward = gbOutwardCode(normalized);
            if (outward) {
                const outwardHits = resolver.lookup(outward);
                if (outwardHits.length > 0) {
                    hits = outwardHits;
                    matchType = "outward";
                }
            }
        }
        if (matchType === "none" && opts.fuzzy) {
            const fuzzyHits = [];
            for (const variant of editDistance1Variants(normalized)) {
                for (const h of resolver.lookup(variant))
                    fuzzyHits.push(h);
            }
            if (fuzzyHits.length > 0) {
                hits = fuzzyHits;
                matchType = "fuzzy";
            }
        }
        // Membership: distinct countries the postcode exists in (regardless of whether we have a centroid).
        const countries = [...new Set(hits.map((h) => h.country))].sort();
        const k = countries.length;
        // Uniform posterior: never weighted by per-country postcode volume.
        const posterior = {};
        for (const c of countries)
            posterior[c] = 1 / k;
        // Placement: one representative coordinate-bearing hit per country (the first with real coords).
        // "No centroid" is encoded as lat = 0 AND lon = 0 together, so a hit is placeable when EITHER
        // coordinate is non-zero. Requiring both to be non-zero would wrongly discard real places whose
        // longitude (or latitude) is exactly 0 — e.g. codes on the prime meridian, which the i16
        // quantization of the binary resolver rounds to exactly 0.
        const candidates = [];
        for (const c of countries) {
            const placed = hits.find((h) => h.country === c && (h.lat !== 0 || h.lon !== 0));
            if (placed)
                candidates.push(placed);
        }
        // Gate the street-word check to the systems this code plausibly belongs to: its gazetteer
        // membership when known (precise — a US-only ZIP never checks the German vocab), else the
        // format-shape candidates from codex (for a code in no gazetteer; its confidence is 0 anyway).
        const systems = countries.length > 0
            ? new Set(countries.map((c) => c.toLowerCase()))
            : new Set(candidateSystemsForPostcode(normalized));
        const position = positionFactor(text, match.start, normalized, systems);
        // confidence = ambiguity term × fuzzy penalty (if typo-corrected) × house-number position factor.
        const confidence = confidenceFromCountryCount(k) * (matchType === "fuzzy" ? FUZZY_PENALTY : 1) * position;
        anchors.push({
            span: { text: spanText, start: match.start, end: match.end },
            normalized,
            candidates,
            posterior,
            confidence,
            matchType,
            positionFactor: position,
        });
    }
    return anchors;
}
|
|
269
|
+
//# sourceMappingURL=postcode-anchor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"postcode-anchor.js","sourceRoot":"","sources":["../postcode-anchor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAEH,OAAO,EAAE,2BAA2B,EAAE,MAAM,kBAAkB,CAAA;AAC9D,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAA;AACzD,OAAO,EAAE,kBAAkB,EAAE,MAAM,qBAAqB,CAAA;AACxD,OAAO,EAAE,mBAAmB,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAA;AAChF,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAA;AA2DrD;;;GAGG;AACH,MAAM,aAAa,GAAG,EAAE,CAAA;AAExB,oGAAoG;AACpG,MAAM,aAAa,GAAG,GAAG,CAAA;AAEzB;;;;;GAKG;AACH,MAAM,UAAU,qBAAqB,CAAC,CAAS;IAC9C,MAAM,OAAO,GAAG,CAAC,EAAU,EAAU,EAAE,CACtC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,4BAA4B,CAAC,CAAC,CAAC,EAAE,CAAA;IACvF,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAA;IAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE;QAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA,CAAC,YAAY;IAC5F,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACnC,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,CAAC,CAAC,CAAC,CAAE,CAAC;YAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;gBAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA,CAAC,gBAAgB;IAClH,CAAC;IACD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAA,CAAC,aAAa;IAC5G,CAAC;IACD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE;QAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA,CAAC,iBAAiB;IACvH,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,CAAA;IAClB,OAAO,CAAC,GAAG,QAAQ,CAAC,CAAA;AACrB,CAAC;AA
ED;;;;GAIG;AACH,MAAM,UAAU,iBAAiB,CAAC,GAAW;IAC5C,IAAI,CAAC,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;IACrD,IAAI,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC;QAAE,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA,CAAC,0CAA0C;IAClF,IAAI,kBAAkB,CAAC,IAAI,CAAC,CAAC,CAAC;QAAE,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,CAAC,CAAA,CAAC,8CAA8C;IACrG,OAAO,CAAC,CAAA;AACT,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,aAAa,CAAC,UAAkB;IAC/C,MAAM,EAAE,GAAG,UAAU,CAAC,OAAO,CAAC,GAAG,CAAC,CAAA;IAClC,IAAI,EAAE,GAAG,CAAC;QAAE,OAAO,IAAI,CAAA;IACvB,OAAO,cAAc,CAAC,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAA;AACtF,CAAC;AAED;;;GAGG;AACH,SAAS,0BAA0B,CAAC,CAAS;IAC5C,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,CAAC,CAAA;IACpB,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IACrB,MAAM,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAA;IACrD,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;AACnC,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,oBAAoB,GAAG,GAAG,CAAA;AAEhC;;;;GAIG;AACH,MAAM,mBAAmB,GAAG,IAAI,GAAG,CAAC;IACnC,UAAU;IACV,OAAO;IACP,SAAS;IACT,MAAM;IACN,OAAO;IACP,OAAO;IACP,QAAQ;IACR,SAAS;IACT,OAAO;IACP,UAAU;IACV,KAAK;IACL,OAAO;IACP,QAAQ;IACR,OAAO;IACP,OAAO;IACP,QAAQ;IACR,QAAQ;IACR,UAAU;CACV,CAAC,CAAA;AAEF,oGAAoG;AACpG,MAAM,kBAAkB,GAAG,CAAC,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,CAAC,CAAA;AAElG;;;;;;;;;;;;;;;;GAgBG;AACH,SAAS,mBAAmB,CAAC,KAAa,EAAE,OAA4B;IACvE,MAAM,CAAC,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,CAAA;IACvD,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,KAAK,CAAA;IAC9B,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,mBAAmB,CAAC,CAAC,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC,CAAC;QAAE,OAAO,IAAI,CAAA;IACzF,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,mBAAmB,CAAC,CAAC,CAAC;QAAE,OAAO,IAAI,CAAA;IAC5D,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,kBAAkB,CAAC,CAAC,CAAC;QAAE,OAAO,IAAI,CAAA;IAC3D,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAA
C,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,IAAI,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC;QAAE,OAAO,IAAI,CAAA;IACvF,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC;QAAE,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAA;IAClG,OAAO,KAAK,CAAA;AACb,CAAC;AAED;;;;;;;;;;GAUG;AACH,SAAS,cAAc,CAAC,IAAY,EAAE,KAAa,EAAE,UAAkB,EAAE,OAA4B;IACpG,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC;QAAE,OAAO,CAAC,CAAA,CAAC,mDAAmD;IAC3F,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,EAAE,KAAK,GAAG,CAAC,CAAC,GAAG,CAAC,CAAA;IACrD,IAAI,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,KAAK,CAAC,CAAA;IACrC,IAAI,MAAM,GAAG,CAAC;QAAE,MAAM,GAAG,IAAI,CAAC,MAAM,CAAA;IACpC,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;QAC/D,IAAI,mBAAmB,CAAC,KAAK,EAAE,OAAO,CAAC;YAAE,OAAO,oBAAoB,CAAA;IACrE,CAAC;IACD,OAAO,CAAC,CAAA;AACT,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,sBAAsB,CACrC,IAAY,EACZ,QAA0B,EAC1B,OAAmC,EAAE;IAErC,MAAM,OAAO,GAAqB,EAAE,CAAA;IAEpC,KAAK,MAAM,KAAK,IAAI,cAAc,CAAC,IAAI,CAAC,EAAE,CAAC;QAC1C,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,KAAK,CAAC,GAAG,CAAC,CAAA;QACnD,MAAM,UAAU,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAA;QAE9C,6FAA6F;QAC7F,IAAI,IAAI,GAAG,QAAQ,CAAC,MAAM,CAAC,UAAU,CAAC,CAAA;QACtC,IAAI,SAAS,GAAgC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAA;QAC/E,IAAI,SAAS,KAAK,MAAM,EAAE,CAAC;YAC1B,MAAM,OAAO,GAAG,aAAa,CAAC,UAAU,CAAC,CAAA;YACzC,IAAI,OAAO,EAAE,CAAC;gBACb,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;gBAC5C,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC5B,IAAI,GAAG,WAAW,CAAA;oBAClB,SAAS,GAAG,SAAS,CAAA;gBACtB,CAAC;YACF,CAAC;QACF,CAAC;QACD,IAAI,SAAS,KAAK,MAAM,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YACxC,MAAM,SAAS,GAAoB,EAAE,CAAA;YACrC,KAAK,MAAM,OAAO,IAAI,qBAAqB,CAAC,UAAU,CAAC,EAAE,CAAC;gBACzD,KAAK,MAAM,CAAC,IAAI,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC;oBAAE,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YAC5D,CAAC;YACD,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1B,IAAI,GAAG,SAAS,CAAA;gBAChB,SAAS,GAAG,OAAO,CAAA;YACpB,CAA
C;QACF,CAAC;QAED,oGAAoG;QACpG,MAAM,SAAS,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;QACjE,MAAM,CAAC,GAAG,SAAS,CAAC,MAAM,CAAA;QAE1B,MAAM,SAAS,GAA2B,EAAE,CAAA;QAC5C,KAAK,MAAM,CAAC,IAAI,SAAS;YAAE,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;QAE/C,iGAAiG;QACjG,MAAM,UAAU,GAAoB,EAAE,CAAA;QACtC,KAAK,MAAM,CAAC,IAAI,SAAS,EAAE,CAAC;YAC3B,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,KAAK,CAAC,CAAC,CAAA;YAC9E,IAAI,MAAM;gBAAE,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACpC,CAAC;QAED,0FAA0F;QAC1F,0FAA0F;QAC1F,+FAA+F;QAC/F,MAAM,OAAO,GACZ,SAAS,CAAC,MAAM,GAAG,CAAC;YACnB,CAAC,CAAC,IAAI,GAAG,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;YAChD,CAAC,CAAC,IAAI,GAAG,CAAS,2BAA2B,CAAC,UAAU,CAAC,CAAC,CAAA;QAC5D,MAAM,QAAQ,GAAG,cAAc,CAAC,IAAI,EAAE,KAAK,CAAC,KAAK,EAAE,UAAU,EAAE,OAAO,CAAC,CAAA;QACvE,MAAM,UAAU,GAAG,0BAA0B,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,KAAK,OAAO,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,QAAQ,CAAA;QAEzG,OAAO,CAAC,IAAI,CAAC;YACZ,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,KAAK,EAAE,KAAK,CAAC,KAAK,EAAE,GAAG,EAAE,KAAK,CAAC,GAAG,EAAE;YAC5D,UAAU;YACV,UAAU;YACV,SAAS;YACT,UAAU;YACV,SAAS;YACT,cAAc,EAAE,QAAQ;SACxB,CAAC,CAAA;IACH,CAAC;IAED,OAAO,OAAO,CAAA;AACf,CAAC"}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Browser-side postcode resolver for the anchor (#240). A pure-JS, zero-dependency
|
|
7
|
+
* `PostcodeResolver` backed by a compact flat binary instead of SQLite, so the postcode anchor
|
|
8
|
+
* runs in the WASM/browser parser behind the same `lookup()` seam as the server-side
|
|
9
|
+
* `WofPostcodeLookup`.
|
|
10
|
+
*
|
|
11
|
+
* This file owns BOTH ends of the format — `serializePostcodeBinary` (run in Node by
|
|
12
|
+
* `scripts/build-postcode-binary.ts`) and `PostcodeBinaryResolver` (run in the browser) — so the
|
|
13
|
+
* layout can never drift between writer and reader.
|
|
14
|
+
*
|
|
15
|
+
* Binary layout (little-endian): magic "PCB1" (4 bytes) u32 recordCount u8 countryCount, then
|
|
16
|
+
* countryCount × 2 ASCII bytes (the country table) u8 keyWidth (max postcode length in bytes)
|
|
17
|
+
* records recordCount × { key[keyWidth] ASCII right-padded with 0x00, u8 countryIdx, i16 latQ,
|
|
18
|
+
* i16 lonQ }, sorted by key bytes ascending. A postcode present in two countries appears as two
|
|
19
|
+
* adjacent records (same key, different countryIdx).
|
|
20
|
+
*
|
|
21
|
+
* Coordinates are quantized to i16: latQ = round(lat/90 × 32767), lonQ = round(lon/180 × 32767),
|
|
22
|
+
* giving ~300 m resolution — ample for a "which city/region" anchor. A record with latQ = lonQ =
|
|
23
|
+
* 0 means "known postcode, no centroid" (membership only), matching the SQLite resolver's
|
|
24
|
+
* convention.
|
|
25
|
+
*/
|
|
26
|
+
import type { AnchorLookup } from "./anchor-inference.js";
|
|
27
|
+
import type { PostcodePlace } from "./postcode-anchor.js";
|
|
28
|
+
/** One gazetteer row accepted by `serializePostcodeBinary`. */
export interface PostcodeBinaryEntry {
    /** Postcode key; per the layout above it is stored as ASCII, right-padded with 0x00 to `keyWidth`. */
    postcode: string;
    /** Country code; the country table stores 2 ASCII bytes per country. */
    country: string;
    /**
     * Latitude, quantized on write as `round(lat/90 × 32767)` (presumably degrees — confirm against
     * the build script). `lat = lon = 0` means "known postcode, no centroid".
     */
    lat: number;
    /** Longitude, quantized on write as `round(lon/180 × 32767)` (presumably degrees — confirm). */
    lon: number;
}
|
|
34
|
+
/**
 * Serialize postcode entries into the flat binary. Entries are sorted by (postcode, country) so
 * equal postcodes land in adjacent records. Run in Node; consumed by
 * {@link PostcodeBinaryResolver}.
 *
 * @param entries Gazetteer rows to encode; the `readonly` type says the input array is not mutated.
 * @returns The complete binary (header, country table, then fixed-width records) as described in
 *   the layout comment at the top of this file.
 */
export declare function serializePostcodeBinary(entries: readonly PostcodeBinaryEntry[]): Uint8Array;
|
|
40
|
+
/**
 * Pure-JS, browser-safe postcode resolver over the flat binary. Implements the same `lookup()` seam
 * as the SQLite `WofPostcodeLookup`, so `extractPostcodeAnchors` is agnostic to which backs it.
 */
export declare class PostcodeBinaryResolver {
    #private;
    /** @param bytes The flat binary, in the format written by `serializePostcodeBinary`. */
    constructor(bytes: Uint8Array);
    /**
     * Resolve a postcode key to its gazetteer places.
     *
     * NOTE(review): the implementation is not visible from this declaration. Per the layout, a
     * postcode present in several countries is stored as adjacent records, so presumably one place is
     * returned per record and an empty array for an unknown key — confirm against the `.js`.
     */
    lookup(postcode: string): PostcodePlace[];
    /**
     * Decode the whole binary into an {@link AnchorLookup} (`Map<postcode, AnchorEntry>`) for the
     * neural anchor channel (#239/#240): each postcode → a uniform posterior over its member
     * countries, together with the mean of its non-zero centroids.
     *
     * This is the browser-side equivalent of the pilot postcode→anchor lookup the model trained
     * against, built live from the shipped binary instead of a precomputed JSON. Records are stored
     * sorted by (postcode, country), so equal keys are contiguous.
     */
    toAnchorLookup(): AnchorLookup;
}
|
|
60
|
+
//# sourceMappingURL=postcode-binary-resolver.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"postcode-binary-resolver.d.ts","sourceRoot":"","sources":["../postcode-binary-resolver.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAA;AACzD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAA;AAOzD,MAAM,WAAW,mBAAmB;IACnC,QAAQ,EAAE,MAAM,CAAA;IAChB,OAAO,EAAE,MAAM,CAAA;IACf,GAAG,EAAE,MAAM,CAAA;IACX,GAAG,EAAE,MAAM,CAAA;CACX;AAUD;;;;GAIG;AACH,wBAAgB,uBAAuB,CAAC,OAAO,EAAE,SAAS,mBAAmB,EAAE,GAAG,UAAU,CA2C3F;AAED;;;GAGG;AACH,qBAAa,sBAAsB;;gBAStB,KAAK,EAAE,UAAU;IA2B7B,MAAM,CAAC,QAAQ,EAAE,MAAM,GAAG,aAAa,EAAE;IA2BzC;;;;;;;;;OASG;IACH,cAAc,IAAI,YAAY;CA+C9B"}
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Browser-side postcode resolver for the anchor (#240). A pure-JS, zero-dependency
|
|
7
|
+
* `PostcodeResolver` backed by a compact flat binary instead of SQLite, so the postcode anchor
|
|
8
|
+
* runs in the WASM/browser parser behind the same `lookup()` seam as the server-side
|
|
9
|
+
* `WofPostcodeLookup`.
|
|
10
|
+
*
|
|
11
|
+
* This file owns BOTH ends of the format — `serializePostcodeBinary` (run in Node by
|
|
12
|
+
* `scripts/build-postcode-binary.ts`) and `PostcodeBinaryResolver` (run in the browser) — so the
|
|
13
|
+
* layout can never drift between writer and reader.
|
|
14
|
+
*
|
|
15
|
+
* Binary layout (little-endian): magic "PCB1" (4 bytes) u32 recordCount u8 countryCount, then
|
|
16
|
+
* countryCount × 2 ASCII bytes (the country table) u8 keyWidth (max postcode length in bytes)
|
|
17
|
+
* records recordCount × { key[keyWidth] ASCII right-padded with 0x00, u8 countryIdx, i16 latQ,
|
|
18
|
+
* i16 lonQ }, sorted by key bytes ascending. A postcode present in two countries appears as two
|
|
19
|
+
* adjacent records (same key, different countryIdx).
|
|
20
|
+
*
|
|
21
|
+
* Coordinates are quantized to i16: latQ = round(lat/90 × 32767), lonQ = round(lon/180 × 32767),
|
|
22
|
+
* giving ~300 m resolution — ample for a "which city/region" anchor. A record with latQ = lonQ =
|
|
23
|
+
* 0 means "known postcode, no centroid" (membership only), matching the SQLite resolver's
|
|
24
|
+
* convention.
|
|
25
|
+
*/
|
|
26
|
+
// File signature, written and read as a little-endian u32 at offset 0.
const MAGIC = 0x31_42_43_50; // "PCB1" little-endian (P=0x50 C=0x43 B=0x42 1=0x31)
// Bytes that follow the key in every record.
const REC_TAIL = 5; // countryIdx(1) + latQ(2) + lonQ(2)
// Quantization scale for latitude: latQ = round(lat × LAT_Q), i.e. lat/90 mapped onto ±32767.
const LAT_Q = 32767 / 90;
// Quantization scale for longitude: lonQ = round(lon × LON_Q), i.e. lon/180 mapped onto ±32767.
const LON_Q = 32767 / 180;
|
|
30
|
+
/**
 * Write a postcode into a fixed-width key slot: the first `width` characters as 7-bit ASCII,
 * followed by NUL padding up to `width` bytes.
 *
 * `\0` sorts below any real char, so shorter keys order before longer ones with the same prefix,
 * which is what we want.
 *
 * @param s The postcode to encode; characters beyond `width` are dropped and each char code is
 *   masked to its low 7 bits.
 * @param width Size of the key slot in bytes.
 * @param out Destination byte array (mutated in place).
 * @param offset Index in `out` where the key slot begins.
 */
function encodeKey(s, width, out, offset) {
    const copied = Math.min(s.length, width);
    // Payload: the characters that fit in the slot.
    for (let i = 0; i < copied; i += 1) {
        out[offset + i] = s.charCodeAt(i) & 0x7f;
    }
    // Padding: NUL-fill whatever remains of the slot.
    for (let i = copied; i < width; i += 1) {
        out[offset + i] = 0;
    }
}
|
|
38
|
+
/**
 * Serialize postcode entries into the flat binary consumed by {@link PostcodeBinaryResolver}.
 * Run in Node.
 *
 * Entries are sorted by (postcode, country) so equal postcodes land in adjacent records. The
 * output is: magic (u32 LE), record count (u32 LE), country count (u8), the country table
 * (2 bytes per country, sorted), key width (u8), then one fixed-width record per entry — key,
 * country index (u8), latQ (i16 LE), lonQ (i16 LE).
 *
 * @param entries Gazetteer rows (`postcode`, `country`, `lat`, `lon`); the array is copied before
 *   sorting, so the caller's array is left untouched.
 * @returns A freshly allocated `Uint8Array` holding the whole binary.
 */
export function serializePostcodeBinary(entries) {
    // Plain `<` / `>` string ordering, primary key postcode, secondary key country.
    const compareText = (left, right) => (left < right ? -1 : left > right ? 1 : 0);
    const ordered = [...entries].sort((x, y) => compareText(x.postcode, y.postcode) || compareText(x.country, y.country));
    // Country table: distinct codes in sorted order; a record refers to its country by table index.
    const countryTable = Array.from(new Set(ordered.map((entry) => entry.country))).sort();
    const indexOfCountry = new Map();
    countryTable.forEach((code, position) => {
        indexOfCountry.set(code, position);
    });
    // Key width is the longest postcode, never less than one byte.
    let keyWidth = 1;
    for (const entry of ordered) {
        keyWidth = Math.max(keyWidth, entry.postcode.length);
    }
    const recordSize = keyWidth + REC_TAIL;
    const headerSize = 4 + 4 + 1 + countryTable.length * 2 + 1;
    const bytes = new Uint8Array(headerSize + ordered.length * recordSize);
    const view = new DataView(bytes.buffer);
    // Scale a coordinate and clamp it to the symmetric i16 range used by the format.
    const quantize = (value, scale) => Math.max(-32767, Math.min(32767, Math.round(value * scale)));
    let cursor = 0;
    // Header.
    view.setUint32(cursor, MAGIC, true);
    cursor += 4;
    view.setUint32(cursor, ordered.length, true);
    cursor += 4;
    bytes[cursor++] = countryTable.length;
    for (const code of countryTable) {
        bytes[cursor++] = code.charCodeAt(0) & 0x7f;
        bytes[cursor++] = code.charCodeAt(1) & 0x7f;
    }
    bytes[cursor++] = keyWidth;
    // Records.
    for (const entry of ordered) {
        encodeKey(entry.postcode, keyWidth, bytes, cursor);
        cursor += keyWidth;
        bytes[cursor++] = indexOfCountry.get(entry.country);
        view.setInt16(cursor, quantize(entry.lat, LAT_Q), true);
        cursor += 2;
        view.setInt16(cursor, quantize(entry.lon, LON_Q), true);
        cursor += 2;
    }
    return bytes;
}
|
|
82
|
+
/**
|
|
83
|
+
* Pure-JS, browser-safe postcode resolver over the flat binary. Implements the same `lookup()` seam
|
|
84
|
+
* as the SQLite `WofPostcodeLookup`, so `extractPostcodeAnchors` is agnostic to which backs it.
|
|
85
|
+
*/
|
|
86
|
+
export class PostcodeBinaryResolver {
|
|
87
|
+
#buf;
|
|
88
|
+
#view;
|
|
89
|
+
#count;
|
|
90
|
+
#countries;
|
|
91
|
+
#keyWidth;
|
|
92
|
+
#recSize;
|
|
93
|
+
#recBase;
|
|
94
|
+
constructor(bytes) {
|
|
95
|
+
this.#buf = bytes;
|
|
96
|
+
this.#view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
|
|
97
|
+
if (this.#view.getUint32(0, true) !== MAGIC)
|
|
98
|
+
throw new Error("postcode binary: bad magic");
|
|
99
|
+
this.#count = this.#view.getUint32(4, true);
|
|
100
|
+
let o = 8;
|
|
101
|
+
const countryCount = bytes[o++];
|
|
102
|
+
this.#countries = [];
|
|
103
|
+
for (let i = 0; i < countryCount; i++) {
|
|
104
|
+
this.#countries.push(String.fromCharCode(bytes[o], bytes[o + 1]));
|
|
105
|
+
o += 2;
|
|
106
|
+
}
|
|
107
|
+
this.#keyWidth = bytes[o++];
|
|
108
|
+
this.#recSize = this.#keyWidth + REC_TAIL;
|
|
109
|
+
this.#recBase = o;
|
|
110
|
+
}
|
|
111
|
+
/** Compare the keyWidth bytes of record `i` against a padded query key. */
|
|
112
|
+
#cmpKey(i, key) {
|
|
113
|
+
const base = this.#recBase + i * this.#recSize;
|
|
114
|
+
for (let j = 0; j < this.#keyWidth; j++) {
|
|
115
|
+
const d = this.#buf[base + j] - key[j];
|
|
116
|
+
if (d !== 0)
|
|
117
|
+
return d;
|
|
118
|
+
}
|
|
119
|
+
return 0;
|
|
120
|
+
}
|
|
121
|
+
lookup(postcode) {
|
|
122
|
+
if (postcode.length > this.#keyWidth)
|
|
123
|
+
return []; // longer than any stored key → impossible
|
|
124
|
+
const key = new Uint8Array(this.#keyWidth);
|
|
125
|
+
encodeKey(postcode, this.#keyWidth, key, 0);
|
|
126
|
+
// Binary search for the first record whose key >= the query.
|
|
127
|
+
let lo = 0;
|
|
128
|
+
let hi = this.#count;
|
|
129
|
+
while (lo < hi) {
|
|
130
|
+
const mid = (lo + hi) >>> 1;
|
|
131
|
+
if (this.#cmpKey(mid, key) < 0)
|
|
132
|
+
lo = mid + 1;
|
|
133
|
+
else
|
|
134
|
+
hi = mid;
|
|
135
|
+
}
|
|
136
|
+
// Collect the contiguous run of equal keys (one per country).
|
|
137
|
+
const out = [];
|
|
138
|
+
for (let i = lo; i < this.#count && this.#cmpKey(i, key) === 0; i++) {
|
|
139
|
+
const base = this.#recBase + i * this.#recSize + this.#keyWidth;
|
|
140
|
+
out.push({
|
|
141
|
+
country: this.#countries[this.#buf[base]],
|
|
142
|
+
lat: this.#view.getInt16(base + 1, true) / LAT_Q,
|
|
143
|
+
lon: this.#view.getInt16(base + 3, true) / LON_Q,
|
|
144
|
+
});
|
|
145
|
+
}
|
|
146
|
+
return out;
|
|
147
|
+
}
|
|
148
|
+
/**
|
|
149
|
+
* Decode the whole binary into an {@link AnchorLookup} (`Map<postcode, AnchorEntry>`) for the
|
|
150
|
+
* neural anchor channel (#239/#240): each postcode → a uniform posterior over its member
|
|
151
|
+
* countries
|
|
152
|
+
*
|
|
153
|
+
* - The mean of its non-zero centroids. This is the browser-side equivalent of the pilot
|
|
154
|
+
* postcode→anchor lookup the model trained against, built live from the shipped binary instead
|
|
155
|
+
* of a precomputed JSON. Records are stored sorted by (postcode, country), so equal keys are
|
|
156
|
+
* contiguous.
|
|
157
|
+
*/
|
|
158
|
+
toAnchorLookup() {
|
|
159
|
+
const out = new Map();
|
|
160
|
+
let i = 0;
|
|
161
|
+
while (i < this.#count) {
|
|
162
|
+
// Decode this record's postcode key (ASCII, 0x00-right-padded).
|
|
163
|
+
const keyBase = this.#recBase + i * this.#recSize;
|
|
164
|
+
let postcode = "";
|
|
165
|
+
for (let j = 0; j < this.#keyWidth; j++) {
|
|
166
|
+
const c = this.#buf[keyBase + j];
|
|
167
|
+
if (c === 0)
|
|
168
|
+
break;
|
|
169
|
+
postcode += String.fromCharCode(c);
|
|
170
|
+
}
|
|
171
|
+
// Walk the contiguous run of records sharing this key (one per member country).
|
|
172
|
+
const posterior = {};
|
|
173
|
+
let latSum = 0;
|
|
174
|
+
let lonSum = 0;
|
|
175
|
+
let centroidCount = 0;
|
|
176
|
+
let k = i;
|
|
177
|
+
for (; k < this.#count; k++) {
|
|
178
|
+
const base = this.#recBase + k * this.#recSize;
|
|
179
|
+
let same = true;
|
|
180
|
+
for (let j = 0; j < this.#keyWidth; j++) {
|
|
181
|
+
if (this.#buf[base + j] !== this.#buf[keyBase + j]) {
|
|
182
|
+
same = false;
|
|
183
|
+
break;
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
if (!same)
|
|
187
|
+
break;
|
|
188
|
+
const tail = base + this.#keyWidth;
|
|
189
|
+
posterior[this.#countries[this.#buf[tail]]] = 1; // uniform — anchorFeatureVector renormalizes
|
|
190
|
+
const lat = this.#view.getInt16(tail + 1, true) / LAT_Q;
|
|
191
|
+
const lon = this.#view.getInt16(tail + 3, true) / LON_Q;
|
|
192
|
+
if (lat !== 0 || lon !== 0) {
|
|
193
|
+
latSum += lat;
|
|
194
|
+
lonSum += lon;
|
|
195
|
+
centroidCount++;
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
out.set(postcode, {
|
|
199
|
+
posterior,
|
|
200
|
+
lat: centroidCount ? latSum / centroidCount : 0,
|
|
201
|
+
lon: centroidCount ? lonSum / centroidCount : 0,
|
|
202
|
+
});
|
|
203
|
+
i = k;
|
|
204
|
+
}
|
|
205
|
+
return out;
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
//# sourceMappingURL=postcode-binary-resolver.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"postcode-binary-resolver.js","sourceRoot":"","sources":["../postcode-binary-resolver.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAKH,MAAM,KAAK,GAAG,aAAa,CAAA,CAAC,qDAAqD;AACjF,MAAM,QAAQ,GAAG,CAAC,CAAA,CAAC,oCAAoC;AACvD,MAAM,KAAK,GAAG,KAAK,GAAG,EAAE,CAAA;AACxB,MAAM,KAAK,GAAG,KAAK,GAAG,GAAG,CAAA;AASzB;;;GAGG;AACH,SAAS,SAAS,CAAC,CAAS,EAAE,KAAa,EAAE,GAAe,EAAE,MAAc;IAC3E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,EAAE;QAAE,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;AAC5F,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,uBAAuB,CAAC,OAAuC;IAC9E,MAAM,MAAM,GAAG,CAAC,GAAG,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CACzC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ;QACtB,CAAC,CAAC,CAAC,CAAC;QACJ,CAAC,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ;YACxB,CAAC,CAAC,CAAC;YACH,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,CAAC,CAAC,OAAO;gBACtB,CAAC,CAAC,CAAC,CAAC;gBACJ,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,CAAC,CAAC,OAAO;oBACtB,CAAC,CAAC,CAAC;oBACH,CAAC,CAAC,CAAC,CACP,CAAA;IACD,MAAM,SAAS,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;IACnE,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAA;IAC3D,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAA;IAC3E,MAAM,OAAO,GAAG,QAAQ,GAAG,QAAQ,CAAA;IAEnC,MAAM,UAAU,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,SAAS,CAAC,MAAM,GAAG,CAAC,GAAG,CAAC,CAAA;IACvD,MAAM,GAAG,GAAG,IAAI,UAAU,CAAC,UAAU,GAAG,MAAM,CAAC,MAAM,GAAG,OAAO,CAAC,CAAA;IAChE,MAAM,IAAI,GAAG,IAAI,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;IAErC,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,CAAA;IAC9B,CAAC,IAAI,CAAC,CAAA;IACN,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAA;IACtC,CAAC,IAAI,CAAC,CAAA;IACN,GAAG,
CAAC,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,CAAA;IAC3B,KAAK,MAAM,CAAC,IAAI,SAAS,EAAE,CAAC;QAC3B,GAAG,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,GAAG,IAAI,CAAA;QACjC,GAAG,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,GAAG,IAAI,CAAA;IAClC,CAAC;IACD,GAAG,CAAC,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAA;IAEnB,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACxB,SAAS,CAAC,CAAC,CAAC,QAAQ,EAAE,QAAQ,EAAE,GAAG,EAAE,CAAC,CAAC,CAAA;QACvC,CAAC,IAAI,QAAQ,CAAA;QACb,GAAG,CAAC,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAE,CAAA;QACrC,IAAI,CAAC,QAAQ,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,CAAA;QACpF,CAAC,IAAI,CAAC,CAAA;QACN,IAAI,CAAC,QAAQ,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,CAAA;QACpF,CAAC,IAAI,CAAC,CAAA;IACP,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;GAGG;AACH,MAAM,OAAO,sBAAsB;IACzB,IAAI,CAAY;IAChB,KAAK,CAAU;IACf,MAAM,CAAQ;IACd,UAAU,CAAU;IACpB,SAAS,CAAQ;IACjB,QAAQ,CAAQ;IAChB,QAAQ,CAAQ;IAEzB,YAAY,KAAiB;QAC5B,IAAI,CAAC,IAAI,GAAG,KAAK,CAAA;QACjB,IAAI,CAAC,KAAK,GAAG,IAAI,QAAQ,CAAC,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,UAAU,EAAE,KAAK,CAAC,UAAU,CAAC,CAAA;QAC3E,IAAI,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,KAAK;YAAE,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAA;QAC1F,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,CAAA;QAC3C,IAAI,CAAC,GAAG,CAAC,CAAA;QACT,MAAM,YAAY,GAAG,KAAK,CAAC,CAAC,EAAE,CAAE,CAAA;QAChC,IAAI,CAAC,UAAU,GAAG,EAAE,CAAA;QACpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,CAAE,EAAE,KAAK,CAAC,CAAC,GAAG,CAAC,CAAE,CAAC,CAAC,CAAA;YACnE,CAAC,IAAI,CAAC,CAAA;QACP,CAAC;QACD,IAAI,CAAC,SAAS,GAAG,KAAK,CAAC,CAAC,EAAE,CAAE,CAAA;QAC5B,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,SAAS,GAAG,QAAQ,CAAA;QACzC,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAA;IAClB,CAAC;IAED,2EAA2E;IAC
3E,OAAO,CAAC,CAAS,EAAE,GAAe;QACjC,MAAM,IAAI,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,GAAG,IAAI,CAAC,QAAQ,CAAA;QAC9C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACzC,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,CAAE,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;YACxC,IAAI,CAAC,KAAK,CAAC;gBAAE,OAAO,CAAC,CAAA;QACtB,CAAC;QACD,OAAO,CAAC,CAAA;IACT,CAAC;IAED,MAAM,CAAC,QAAgB;QACtB,IAAI,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC,SAAS;YAAE,OAAO,EAAE,CAAA,CAAC,0CAA0C;QAC1F,MAAM,GAAG,GAAG,IAAI,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;QAC1C,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,EAAE,GAAG,EAAE,CAAC,CAAC,CAAA;QAE3C,6DAA6D;QAC7D,IAAI,EAAE,GAAG,CAAC,CAAA;QACV,IAAI,EAAE,GAAG,IAAI,CAAC,MAAM,CAAA;QACpB,OAAO,EAAE,GAAG,EAAE,EAAE,CAAC;YAChB,MAAM,GAAG,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,KAAK,CAAC,CAAA;YAC3B,IAAI,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,GAAG,CAAC,GAAG,CAAC;gBAAE,EAAE,GAAG,GAAG,GAAG,CAAC,CAAA;;gBACvC,EAAE,GAAG,GAAG,CAAA;QACd,CAAC;QAED,8DAA8D;QAC9D,MAAM,GAAG,GAAoB,EAAE,CAAA;QAC/B,KAAK,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,OAAO,CAAC,CAAC,EAAE,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YACrE,MAAM,IAAI,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,GAAG,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAA;YAC/D,GAAG,CAAC,IAAI,CAAC;gBACR,OAAO,EAAE,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAE,CAAE;gBAC3C,GAAG,EAAE,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,EAAE,IAAI,CAAC,GAAG,KAAK;gBAChD,GAAG,EAAE,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,EAAE,IAAI,CAAC,GAAG,KAAK;aAChD,CAAC,CAAA;QACH,CAAC;QACD,OAAO,GAAG,CAAA;IACX,CAAC;IAED;;;;;;;;;OASG;IACH,cAAc;QACb,MAAM,GAAG,GAAiB,IAAI,GAAG,EAAE,CAAA;QACnC,IAAI,CAAC,GAAG,CAAC,CAAA;QACT,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;YACxB,gEAAgE;YAChE,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,GAAG,IAAI,CAAC,QAAQ,CAAA;YACjD,IAAI,QAAQ,GAAG,EAAE,CAAA;YACjB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACzC,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,GAAG,CAAC,CAAE,CAAA;gBACjC,IAAI,CAAC,KAAK,CAAC;oBAAE,MAAK;gBAClB,QAAQ,IAAI,MAAM,CAAC,YAAY,CAA
C,CAAC,CAAC,CAAA;YACnC,CAAC;YACD,gFAAgF;YAChF,MAAM,SAAS,GAA2B,EAAE,CAAA;YAC5C,IAAI,MAAM,GAAG,CAAC,CAAA;YACd,IAAI,MAAM,GAAG,CAAC,CAAA;YACd,IAAI,aAAa,GAAG,CAAC,CAAA;YACrB,IAAI,CAAC,GAAG,CAAC,CAAA;YACT,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC7B,MAAM,IAAI,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,GAAG,IAAI,CAAC,QAAQ,CAAA;gBAC9C,IAAI,IAAI,GAAG,IAAI,CAAA;gBACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;oBACzC,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,CAAC,KAAK,IAAI,CAAC,IAAI,CAAC,OAAO,GAAG,CAAC,CAAC,EAAE,CAAC;wBACpD,IAAI,GAAG,KAAK,CAAA;wBACZ,MAAK;oBACN,CAAC;gBACF,CAAC;gBACD,IAAI,CAAC,IAAI;oBAAE,MAAK;gBAChB,MAAM,IAAI,GAAG,IAAI,GAAG,IAAI,CAAC,SAAS,CAAA;gBAClC,SAAS,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAE,CAAE,CAAC,GAAG,CAAC,CAAA,CAAC,6CAA6C;gBAC/F,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,EAAE,IAAI,CAAC,GAAG,KAAK,CAAA;gBACvD,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,EAAE,IAAI,CAAC,GAAG,KAAK,CAAA;gBACvD,IAAI,GAAG,KAAK,CAAC,IAAI,GAAG,KAAK,CAAC,EAAE,CAAC;oBAC5B,MAAM,IAAI,GAAG,CAAA;oBACb,MAAM,IAAI,GAAG,CAAA;oBACb,aAAa,EAAE,CAAA;gBAChB,CAAC;YACF,CAAC;YACD,GAAG,CAAC,GAAG,CAAC,QAAQ,EAAE;gBACjB,SAAS;gBACT,GAAG,EAAE,aAAa,CAAC,CAAC,CAAC,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;gBAC/C,GAAG,EAAE,aAAa,CAAC,CAAC,CAAC,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;aAC/C,CAAC,CAAA;YACF,CAAC,GAAG,CAAC,CAAA;QACN,CAAC;QACD,OAAO,GAAG,CAAA;IACX,CAAC;CACD"}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Postcode regex repair pass — v0.7 task #35 ("postcode regex pre-pass").
|
|
7
|
+
*
|
|
8
|
+
* The 2026-05-29 postcode diagnostic showed the neural model fragments alphanumeric postcodes at
|
|
9
|
+
* the SentencePiece layer (GB/CA/NL at 0%, US 80.5%, FR 70.1%). Three failure modes were visible
|
|
10
|
+
* in the data:
|
|
11
|
+
*
|
|
12
|
+
* 1. Total miss — "London SW1A 1AA" → (no postcode label)
|
|
13
|
+
* 2. Truncation — "M5V 2T6" → "2T6"; "B12 8QX" → "B12"
|
|
14
|
+
* 3. Char-drift — "75008" → "5008"; "62701" → "2701" (and smear: "1200-030 Lisboa" → "200-030 Lis")
|
|
15
|
+
*
|
|
16
|
+
* This pass runs AFTER the model's per-token BIO labels are decoded but BEFORE `buildAddressTree`.
|
|
17
|
+
* It detects postcode-shaped substrings with per-country regexes and repairs the label sequence
|
|
18
|
+
* so the postcode span matches the detected shape. The model is untouched — this is a
|
|
19
|
+
* deterministic decoder-side correction, the "lowest risk" lever in the v0.7 plan (vs. #36's soft
|
|
20
|
+
* FST shallow-fusion or #41's char-level encoder).
|
|
21
|
+
*
|
|
22
|
+
* PRECISION GUARDS (so we never regress the countries already passing):
|
|
23
|
+
*
|
|
24
|
+
* - Alphanumeric shapes (GB/CA/NL/DE-prefixed) are high-confidence "this IS a postcode" patterns →
|
|
25
|
+
* eligible to ADD a span where the model emitted none, but only over non-structural labels
|
|
26
|
+
* (never over house_number/street/etc.).
|
|
27
|
+
* - Numeric shapes (\d{5}, ZIP+4, JP, PT, PL) are ambiguous (a bare 5-digit could be a house number)
|
|
28
|
+
* → SNAP-only: they expand/clip an EXISTING postcode span, never create one from scratch.
|
|
29
|
+
* - Smear cleanup is LOCAL: only postcode tokens immediately flanking a snapped span are cleared. We
|
|
30
|
+
* never globally clear unmatched postcode tokens — that would regress shapes we don't
|
|
31
|
+
* pattern-match (AU 4-digit, IN 6-digit, …).
|
|
32
|
+
*/
|
|
33
|
+
import type { DecoderToken } from "@mailwoman/core/decoder";
|
|
34
|
+
/**
 * One postcode-shaped substring found in the input text: where it sits (`start`..`end`) and how
 * much the shape can be trusted.
 */
export interface PostcodeMatch {
    /** Character position where the match begins. */
    start: number;
    /** Character position where the match ends. */
    end: number;
    /**
     * Confidence class of the shape: an "alnum" match is allowed to ADD a postcode span, while a
     * "numeric" match may only SNAP a span that already exists.
     */
    kind: "alnum" | "numeric";
    /** Rank of the pattern that matched; a lower value is more specific and wins when matches overlap. */
    priority: number;
}
|
|
43
|
+
/**
 * Postcode shape patterns per country, listed from most specific to least specific. The
 * alphanumeric patterns insist on uppercase letters — postcodes are conventionally written in
 * uppercase (and are uppercase in the eval data) — which stops them from firing on ordinary
 * lowercase prose.
 */
export declare const POSTCODE_PATTERNS: {
    label: string;
    kind: "alnum" | "numeric";
    re: RegExp;
}[];
|
|
53
|
+
/**
 * Gather the postcode-shaped matches in `text`, keeping only matches that do not overlap; when
 * candidates collide, the more specific (earlier-listed) pattern is preferred.
 */
export declare function collectMatches(text: string): Array<PostcodeMatch>;
|
|
55
|
+
/** Outcome of a postcode repair pass: the resulting tokens plus a count of relabelled tokens. */
export interface RepairResult {
    /** The token sequence after repair. */
    tokens: Array<DecoderToken>;
    /** How many token labels were changed — intended for telemetry / logging. */
    changed: number;
}
|
|
60
|
+
/**
 * Fix up the postcode label spans of an already-decoded token sequence with the per-country
 * regexes. The `input` tokens are left untouched: the result carries a NEW token array together
 * with the number of labels that were changed.
 */
export declare function repairPostcodeLabels(text: string, input: ReadonlyArray<DecoderToken>): RepairResult;
|
|
65
|
+
//# sourceMappingURL=postcode-repair.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"postcode-repair.d.ts","sourceRoot":"","sources":["../postcode-repair.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAA;AAE3D,qFAAqF;AACrF,MAAM,WAAW,aAAa;IAC7B,KAAK,EAAE,MAAM,CAAA;IACb,GAAG,EAAE,MAAM,CAAA;IACX,+EAA+E;IAC/E,IAAI,EAAE,OAAO,GAAG,SAAS,CAAA;IACzB,yEAAyE;IACzE,QAAQ,EAAE,MAAM,CAAA;CAChB;AAED;;;;GAIG;AACH,eAAO,MAAM,iBAAiB,EAAE,KAAK,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,OAAO,GAAG,SAAS,CAAC;IAAC,EAAE,EAAE,MAAM,CAAA;CAAE,CAiB7F,CAAA;AA0BD,6FAA6F;AAC7F,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,aAAa,EAAE,CAkB5D;AAED,MAAM,WAAW,YAAY;IAC5B,MAAM,EAAE,YAAY,EAAE,CAAA;IACtB,gEAAgE;IAChE,OAAO,EAAE,MAAM,CAAA;CACf;AAED;;;GAGG;AACH,wBAAgB,oBAAoB,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,YAAY,EAAE,GAAG,YAAY,CAmE/F"}
|