@mailwoman/neural 2.1.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/anchor-inference.d.ts +57 -0
- package/out/anchor-inference.d.ts.map +1 -0
- package/out/anchor-inference.js +94 -0
- package/out/anchor-inference.js.map +1 -0
- package/out/browser.d.ts +18 -0
- package/out/browser.d.ts.map +1 -0
- package/out/browser.js +19 -0
- package/out/browser.js.map +1 -0
- package/out/classifier.d.ts +145 -11
- package/out/classifier.d.ts.map +1 -1
- package/out/classifier.js +185 -20
- package/out/classifier.js.map +1 -1
- package/out/fst-prior.d.ts +71 -0
- package/out/fst-prior.d.ts.map +1 -0
- package/out/fst-prior.js +173 -0
- package/out/fst-prior.js.map +1 -0
- package/out/index.d.ts +7 -0
- package/out/index.d.ts.map +1 -1
- package/out/index.js +5 -0
- package/out/index.js.map +1 -1
- package/out/labels.d.ts +30 -6
- package/out/labels.d.ts.map +1 -1
- package/out/labels.js +43 -6
- package/out/labels.js.map +1 -1
- package/out/onnx-runner.d.ts +8 -1
- package/out/onnx-runner.d.ts.map +1 -1
- package/out/onnx-runner.js +31 -1
- package/out/onnx-runner.js.map +1 -1
- package/out/postcode-anchor.d.ts +117 -0
- package/out/postcode-anchor.d.ts.map +1 -0
- package/out/postcode-anchor.js +269 -0
- package/out/postcode-anchor.js.map +1 -0
- package/out/postcode-binary-resolver.d.ts +60 -0
- package/out/postcode-binary-resolver.d.ts.map +1 -0
- package/out/postcode-binary-resolver.js +208 -0
- package/out/postcode-binary-resolver.js.map +1 -0
- package/out/postcode-repair.d.ts +65 -0
- package/out/postcode-repair.d.ts.map +1 -0
- package/out/postcode-repair.js +171 -0
- package/out/postcode-repair.js.map +1 -0
- package/out/proposal-classifier.d.ts +5 -1
- package/out/proposal-classifier.d.ts.map +1 -1
- package/out/proposal-classifier.js +5 -3
- package/out/proposal-classifier.js.map +1 -1
- package/out/query-shape-prior.d.ts +74 -0
- package/out/query-shape-prior.d.ts.map +1 -0
- package/out/query-shape-prior.js +223 -0
- package/out/query-shape-prior.js.map +1 -0
- package/out/street-morphology-prior.d.ts +56 -0
- package/out/street-morphology-prior.d.ts.map +1 -0
- package/out/street-morphology-prior.js +159 -0
- package/out/street-morphology-prior.js.map +1 -0
- package/out/tokenizer.d.ts +6 -1
- package/out/tokenizer.d.ts.map +1 -1
- package/out/tokenizer.js +8 -3
- package/out/tokenizer.js.map +1 -1
- package/out/unit-repair.d.ts +46 -0
- package/out/unit-repair.d.ts.map +1 -0
- package/out/unit-repair.js +147 -0
- package/out/unit-repair.js.map +1 -0
- package/out/viterbi.d.ts +76 -0
- package/out/viterbi.d.ts.map +1 -0
- package/out/viterbi.js +163 -0
- package/out/viterbi.js.map +1 -0
- package/out/vitest.config.d.ts.map +1 -1
- package/out/vitest.config.js +3 -0
- package/out/vitest.config.js.map +1 -1
- package/out/weights.d.ts +42 -0
- package/out/weights.d.ts.map +1 -1
- package/out/weights.js +92 -4
- package/out/weights.js.map +1 -1
- package/package.json +10 -3
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Postcode anchor — the first member of the "anchor-based parsing" family (Direction D, #240). See
|
|
7
|
+
* `docs/articles/plan/2026-06-03-anchor-based-parsing.md`.
|
|
8
|
+
*
|
|
9
|
+
* A postcode is the most information-dense token in an address: a hierarchical geo-encoding that
|
|
10
|
+
* places a query on Earth far more cheaply than the rest of the parse. This module lifts the
|
|
11
|
+
* postcode out of the BIO sequence-labelling problem and treats it as a structured anchor. It
|
|
12
|
+
* runs the same per-country shape regexes the decoder repair pass uses ({@link collectMatches}),
|
|
13
|
+
* resolves each shaped span against a postcode gazetteer, and returns a SOFT signal: a country
|
|
14
|
+
* posterior plus a calibrated confidence. It never decides a postcode's identity on its own — it
|
|
15
|
+
* reports "this string is (or is not) a real postcode, in these countries, near here", and leaves
|
|
16
|
+
* the parser to weigh that against the surrounding tokens.
|
|
17
|
+
*
|
|
18
|
+
* Two design rules carried from the DeepSeek consult
|
|
19
|
+
* (`.agents/skills/deepseek-consult/ds-pc-turn{1,2}-postcode-anchor.txt`):
|
|
20
|
+
*
|
|
21
|
+
* - The country posterior is UNIFORM over the countries a string actually exists in. We never weight
|
|
22
|
+
* by per-country postcode volume, because that skews "75001" toward whichever country owns
|
|
23
|
+
* more 5-digit codes — the exact bias the anchor exists to avoid. Disambiguation is the
|
|
24
|
+
* parser's job, using script, city tokens, and user locale.
|
|
25
|
+
* - Confidence combines gazetteer MEMBERSHIP with country AMBIGUITY. A string that matches a postcode
|
|
26
|
+
* regex but exists in no gazetteer (a bare `27`, or a 5-digit house number that is not a real
|
|
27
|
+
* code) gets confidence 0, so the parser treats it as a house number. A real-but-ambiguous
|
|
28
|
+
* code (`75001` in FR and US) gets moderate confidence. A real, single-country code gets
|
|
29
|
+
* 1.0.
|
|
30
|
+
*/
|
|
31
|
+
/**
 * One gazetteer record for a postcode string: the country the code was found in plus a
 * representative centroid.
 *
 * A record whose `lat` and `lon` are both `0` is the "known postcode, no centroid yet" sentinel:
 * it still counts towards country membership, it just cannot place the query on the map.
 */
export interface PostcodePlace {
    /** Country code of the gazetteer entry (the anchor keys its posterior by this value). */
    country: string;
    /** Centroid latitude; `0` together with `lon === 0` means "no centroid". */
    lat: number;
    /** Centroid longitude; `0` together with `lat === 0` means "no centroid". */
    lon: number;
}
|
|
37
|
+
/**
 * The single capability the anchor requires from a postcode gazetteer.
 *
 * Known implementations are an in-memory fake used by tests and a SQLite-backed lookup over the
 * `postalcode-*.db` shards (`@mailwoman/resolver-wof-sqlite`). The interface is deliberately one
 * method wide so that another backend (for example an FST/WASM resolver) can be substituted
 * without any change to the anchor logic.
 */
export interface PostcodeResolver {
    /**
     * Exact-match lookup of an already-normalized postcode string across every country shard.
     *
     * @param postcode - The normalized key to look up.
     * @returns Every gazetteer record for that key; an empty array when the key is unknown.
     */
    lookup(postcode: string): PostcodePlace[];
}
|
|
47
|
+
/**
 * The soft signal emitted for one postcode-shaped span: where it sat in the text, which countries
 * the gazetteer knows it in, and how much weight the parser should give it.
 */
export interface PostcodeAnchor {
    /** The shaped substring exactly as it appeared in the raw text, with its character offsets. */
    span: {
        text: string;
        start: number;
        end: number;
    };
    /**
     * The span's canonical gazetteer key (uppercased, whitespace collapsed, German `D-` prefix
     * stripped, Dutch `1012 LM` joined to `1012LM`). This is the key tried first; for `outward`
     * and `fuzzy` matches the hits come from the outward code or a variant, but this field still
     * holds the span's own normalized form.
     */
    normalized: string;
    /** Coordinate-bearing gazetteer hits: at most one representative centroid per country. */
    candidates: PostcodePlace[];
    /**
     * Uniform distribution over the countries the postcode exists in. It reflects membership only
     * and does not depend on whether a centroid is available. Empty when nothing matched.
     */
    posterior: Record<string, number>;
    /**
     * `1 - log2(k) / log2(10)` for a code found in `k` countries (clamped to [0, 1]), then scaled
     * by the fuzzy penalty when `matchType` is `fuzzy` and by {@link positionFactor}. `0` when the
     * code is in no gazetteer.
     */
    confidence: number;
    /**
     * How the hits were obtained:
     *
     * - `exact`: the normalized string is itself a gazetteer key.
     * - `outward`: a GB unit (`SO4 3RX`) resolved to its outward district (`SO4`), the granularity
     *   the GB gazetteer is aggregated at. No penalty applies; it is a real, confident GB match.
     * - `fuzzy`: only an edit-distance-1 variant exists (a likely typo or OCR slip), so the
     *   confidence carries a penalty.
     * - `none`: in no gazetteer.
     */
    matchType: "exact" | "outward" | "fuzzy" | "none";
    /**
     * Structural house-number prior in [0, 1]: `1` for a code that cannot be a house number, and
     * below `1` for a digit-only code that shares its comma-delimited segment with a street word
     * (so it reads as a house number rather than a postcode). It is already folded into
     * {@link confidence}; it is exposed separately so a consumer can rank competing spans, or see
     * why one was down-weighted, without recomputing it.
     */
    positionFactor: number;
}
|
|
81
|
+
/** Optional switches for {@link extractPostcodeAnchors}. */
export interface ExtractPostcodeAnchorsOpts {
    /**
     * When neither the exact key nor the GB outward code is found, retry the class-aware
     * edit-distance-1 variants of the key to absorb single-character typos (for example a
     * transposition, `75080` → `75008`, or a dropped digit, `7508` → `75008`).
     *
     * Variants never cross character classes, so a letter typed in place of a digit (`O` for `0`)
     * is not recovered by this option. Off by default so existing callers keep exact-match
     * behaviour.
     */
    fuzzy?: boolean;
}
|
|
88
|
+
/**
 * Enumerate the class-aware edit-distance-1 neighbours of a postcode string.
 *
 * The neighbours are produced by single-character deletions, same-class substitutions
 * (digit↔digit, letter↔letter), same-class insertions, and adjacent transpositions. Keeping
 * substitutions and insertions inside the character's own class mirrors how postcodes are
 * usually mistyped (a digit becomes another digit, not a letter) and keeps the candidate set
 * small.
 *
 * @param s - The (normalized, uppercase) postcode string.
 * @returns The distinct variants, never including `s` itself.
 */
export declare function editDistance1Variants(s: string): string[];
|
|
95
|
+
/**
 * Convert a shaped span into the canonical gazetteer key.
 *
 * The span is trimmed, uppercased, and has internal whitespace runs collapsed to one space. Two
 * country conventions are then applied: the German `D-` courtesy prefix is stripped (the shards
 * store `68161`, not `D-68161`), and the space in a Dutch-shaped code is removed (`1012 LM` →
 * `1012LM`).
 *
 * @param raw - The span text as it appeared in the input.
 * @returns The normalized lookup key.
 */
export declare function normalizePostcode(raw: string): string;
|
|
101
|
+
/**
 * Return the GB outward code of a normalized unit postcode: the part before the space when the
 * inward half has the shape `\d[A-Z]{2}` (`SO4 3RX` → `SO4`).
 *
 * The GB gazetteer is aggregated to outward codes (2.7M units is too large and too fine for an
 * anchor), so the extractor retries the outward code when a full GB unit misses.
 *
 * @param normalized - A key produced by {@link normalizePostcode}.
 * @returns The outward code, or `null` for any string that is not shaped like a GB unit
 *   postcode (so the fallback never fires elsewhere).
 */
export declare function gbOutwardCode(normalized: string): string | null;
|
|
109
|
+
/**
 * Find every postcode-shaped span in `text` and turn it into a soft anchor.
 *
 * Each span is resolved against the gazetteer and reported with a country posterior and a
 * confidence. A span that matches a postcode shape but exists in no gazetteer is still
 * returned, with an empty posterior and confidence 0: an explicit "looks like a postcode, but
 * isn't one", so the caller can see that the extractor fired and chose not to anchor.
 *
 * @param text - The raw input text.
 * @param resolver - The gazetteer used for lookups.
 * @param opts - Optional switches; see {@link ExtractPostcodeAnchorsOpts}.
 * @returns One anchor per shaped span.
 */
export declare function extractPostcodeAnchors(text: string, resolver: PostcodeResolver, opts?: ExtractPostcodeAnchorsOpts): PostcodeAnchor[];
|
|
117
|
+
//# sourceMappingURL=postcode-anchor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"postcode-anchor.d.ts","sourceRoot":"","sources":["../postcode-anchor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAQH,uGAAuG;AACvG,MAAM,WAAW,aAAa;IAC7B,OAAO,EAAE,MAAM,CAAA;IACf,GAAG,EAAE,MAAM,CAAA;IACX,GAAG,EAAE,MAAM,CAAA;CACX;AAED;;;;;GAKG;AACH,MAAM,WAAW,gBAAgB;IAChC,qFAAqF;IACrF,MAAM,CAAC,QAAQ,EAAE,MAAM,GAAG,aAAa,EAAE,CAAA;CACzC;AAED,MAAM,WAAW,cAAc;IAC9B,8EAA8E;IAC9E,IAAI,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE,CAAA;IAClD,qGAAqG;IACrG,UAAU,EAAE,MAAM,CAAA;IAClB,mGAAmG;IACnG,UAAU,EAAE,aAAa,EAAE,CAAA;IAC3B;;;OAGG;IACH,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IACjC,mGAAmG;IACnG,UAAU,EAAE,MAAM,CAAA;IAClB;;;;;OAKG;IACH,SAAS,EAAE,OAAO,GAAG,SAAS,GAAG,OAAO,GAAG,MAAM,CAAA;IACjD;;;;;;OAMG;IACH,cAAc,EAAE,MAAM,CAAA;CACtB;AAED,MAAM,WAAW,0BAA0B;IAC1C;;;OAGG;IACH,KAAK,CAAC,EAAE,OAAO,CAAA;CACf;AAWD;;;;;GAKG;AACH,wBAAgB,qBAAqB,CAAC,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAczD;AAED;;;;GAIG;AACH,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAKrD;AAED;;;;;;GAMG;AACH,wBAAgB,aAAa,CAAC,UAAU,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAI/D;AAuGD;;;;;;GAMG;AACH,wBAAgB,sBAAsB,CACrC,IAAI,EAAE,MAAM,EACZ,QAAQ,EAAE,gBAAgB,EAC1B,IAAI,GAAE,0BAA+B,GACnC,cAAc,EAAE,CAmElB"}
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Postcode anchor — the first member of the "anchor-based parsing" family (Direction D, #240). See
|
|
7
|
+
* `docs/articles/plan/2026-06-03-anchor-based-parsing.md`.
|
|
8
|
+
*
|
|
9
|
+
* A postcode is the most information-dense token in an address: a hierarchical geo-encoding that
|
|
10
|
+
* places a query on Earth far more cheaply than the rest of the parse. This module lifts the
|
|
11
|
+
* postcode out of the BIO sequence-labelling problem and treats it as a structured anchor. It
|
|
12
|
+
* runs the same per-country shape regexes the decoder repair pass uses ({@link collectMatches}),
|
|
13
|
+
* resolves each shaped span against a postcode gazetteer, and returns a SOFT signal: a country
|
|
14
|
+
* posterior plus a calibrated confidence. It never decides a postcode's identity on its own — it
|
|
15
|
+
* reports "this string is (or is not) a real postcode, in these countries, near here", and leaves
|
|
16
|
+
* the parser to weigh that against the surrounding tokens.
|
|
17
|
+
*
|
|
18
|
+
* Two design rules carried from the DeepSeek consult
|
|
19
|
+
* (`.agents/skills/deepseek-consult/ds-pc-turn{1,2}-postcode-anchor.txt`):
|
|
20
|
+
*
|
|
21
|
+
* - The country posterior is UNIFORM over the countries a string actually exists in. We never weight
|
|
22
|
+
* by per-country postcode volume, because that skews "75001" toward whichever country owns
|
|
23
|
+
* more 5-digit codes — the exact bias the anchor exists to avoid. Disambiguation is the
|
|
24
|
+
* parser's job, using script, city tokens, and user locale.
|
|
25
|
+
* - Confidence combines gazetteer MEMBERSHIP with country AMBIGUITY. A string that matches a postcode
|
|
26
|
+
* regex but exists in no gazetteer (a bare `27`, or a 5-digit house number that is not a real
|
|
27
|
+
* code) gets confidence 0, so the parser treats it as a house number. A real-but-ambiguous
|
|
28
|
+
* code (`75001` in FR and US) gets moderate confidence. A real, single-country code gets
|
|
29
|
+
* 1.0.
|
|
30
|
+
*/
|
|
31
|
+
import { candidateSystemsForPostcode } from "@mailwoman/codex";
|
|
32
|
+
import { isGermanStreetToken } from "@mailwoman/codex/de";
|
|
33
|
+
import { isFrenchStreetWord } from "@mailwoman/codex/fr";
|
|
34
|
+
import { isStreetSuffixToken, isUsStateAbbreviation } from "@mailwoman/codex/us";
|
|
35
|
+
import { collectMatches } from "./postcode-repair.js";
|
|
36
|
+
/**
 * Number of countries at which the confidence formula bottoms out: a code shared by this many
 * countries (or more) gets confidence 0.
 */
const MAX_COUNTRIES = 10;
/**
 * Multiplier applied to the confidence of a fuzzy (typo-corrected) match, which is less certain
 * than an exact one.
 */
const FUZZY_PENALTY = 0.6;
|
|
43
|
+
/**
 * Enumerate the class-aware edit-distance-1 neighbours of a postcode string.
 *
 * Variants are generated in four passes (deletions, same-class substitutions, same-class
 * insertions, adjacent transpositions) and returned in first-seen order. Substitutions and
 * insertions only draw from the alphabet of the neighbouring character (digits for a digit,
 * `A`-`Z` for an uppercase letter, nothing for anything else), which keeps the candidate set
 * small.
 *
 * @param {string} s The (normalized, uppercase) postcode string.
 * @returns {string[]} The distinct variants, never including `s` itself.
 */
export function editDistance1Variants(s) {
    const DIGITS = "0123456789";
    const LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    // Alphabet a single character may be swapped with; empty for spaces, punctuation, or "".
    const alphabetOf = (ch) => {
        if (ch >= "0" && ch <= "9") {
            return DIGITS;
        }
        if (ch >= "A" && ch <= "Z") {
            return LETTERS;
        }
        return "";
    };
    const length = s.length;
    const seen = new Set();
    // Pass 1: drop one character.
    for (let at = 0; at < length; at++) {
        seen.add(s.substring(0, at) + s.substring(at + 1));
    }
    // Pass 2: replace one character with another of the same class.
    for (let at = 0; at < length; at++) {
        const current = s[at];
        const head = s.substring(0, at);
        const tail = s.substring(at + 1);
        for (const replacement of alphabetOf(current)) {
            if (replacement === current) {
                continue;
            }
            seen.add(head + replacement + tail);
        }
    }
    // Pass 3: insert one character; its class follows the character at the insertion point, or
    // the preceding character when inserting at the very end.
    for (let at = 0; at <= length; at++) {
        const neighbour = at < length ? s[at] : at > 0 ? s[at - 1] : "";
        const head = s.substring(0, at);
        const tail = s.substring(at);
        for (const inserted of alphabetOf(neighbour)) {
            seen.add(head + inserted + tail);
        }
    }
    // Pass 4: swap two adjacent characters.
    for (let at = 0; at + 1 < length; at++) {
        seen.add(s.substring(0, at) + s[at + 1] + s[at] + s.substring(at + 2));
    }
    // A same-character transposition or insertion can reproduce the input; it is not a variant.
    seen.delete(s);
    return Array.from(seen);
}
|
|
68
|
+
/**
 * Convert a shaped span into the canonical gazetteer key.
 *
 * The span is trimmed, uppercased, and has every whitespace run collapsed to a single space.
 * Two country conventions are then applied: a German `D-` courtesy prefix is dropped (the shards
 * store `68161`, not `D-68161`), and the space inside a Dutch-shaped code is removed (the
 * gazetteer stores `1012LM`, not `1012 LM`).
 *
 * @param {string} raw The span text as it appeared in the input.
 * @returns {string} The normalized lookup key.
 */
export function normalizePostcode(raw) {
    const collapsed = raw.trim().toUpperCase().replace(/\s+/g, " ");
    // German courtesy prefix: D-68161 → 68161. The result is five digits, so the Dutch rule
    // below can never apply to it.
    if (/^D-\d{5}$/.test(collapsed)) {
        return collapsed.slice(2);
    }
    // Dutch: four digits, one space, two letters → drop the space at index 4.
    if (/^\d{4} [A-Z]{2}$/.test(collapsed)) {
        return collapsed.slice(0, 4) + collapsed.slice(5);
    }
    return collapsed;
}
|
|
81
|
+
/**
 * Return the GB outward code of a normalized unit postcode: the text before the first space,
 * provided everything after that space has the inward shape `\d[A-Z]{2}` (`SO4 3RX` → `SO4`).
 *
 * The GB gazetteer is aggregated to outward codes (2.7M units is too large and too fine for an
 * anchor), so the extractor retries the outward code when a full GB unit misses.
 *
 * @param {string} normalized A key produced by `normalizePostcode`.
 * @returns {string | null} The outward code, or `null` when the string is not shaped like a GB
 *   unit postcode (so the fallback never fires elsewhere).
 */
export function gbOutwardCode(normalized) {
    // Group 1 cannot contain a space, so the literal space in the pattern is necessarily the
    // first space of the string, and the inward half must run to the end.
    const parts = /^([^ ]+) \d[A-Z]{2}$/.exec(normalized);
    if (parts === null) {
        return null;
    }
    return parts[1];
}
|
|
94
|
+
/**
 * Map the number of countries a postcode exists in to a confidence in [0, 1].
 *
 * The value is `1 - log2(k) / log2(MAX_COUNTRIES)`, clamped: one country gives 1.0, two give
 * roughly 0.70, and `MAX_COUNTRIES` or more give 0. A non-positive count (no gazetteer hit)
 * gives 0.
 *
 * @param {number} k Number of distinct countries.
 * @returns {number} Confidence in [0, 1].
 */
function confidenceFromCountryCount(k) {
    if (k === 1) {
        return 1;
    }
    if (k <= 0) {
        return 0;
    }
    const ambiguity = Math.log2(k) / Math.log2(MAX_COUNTRIES);
    const unclamped = 1 - ambiguity;
    return Math.min(1, Math.max(0, unclamped));
}
|
|
106
|
+
/**
 * Confidence multiplier for a digit-only code that shares its comma-delimited segment with a
 * street word.
 *
 * A house number and a 5-digit postcode have the same shape, so gazetteer membership alone
 * cannot separate `12345 Main St` (a house number that happens to be a real ZIP elsewhere) from
 * `San Francisco 94105` (a postcode). The structural tell is cheap and locale-general: house
 * numbers sit beside the street, postcodes beside the city. The confidence is scaled rather
 * than zeroed because the gazetteer still vouches for the shape, so a lone code in a street-only
 * line stays usable; the penalty only lets a real trailing postcode out-rank it.
 */
const HOUSE_NUMBER_PENALTY = 0.2;
/**
 * Standalone street-type words for the locales that have no codex slice yet (Spanish and
 * Italian). US words come from `@mailwoman/codex/us`, German from `@mailwoman/codex/de`, and
 * French from `@mailwoman/codex/fr`; Dutch compound suffixes are listed separately below until
 * a `codex/nl` slice exists.
 */
const NON_US_STREET_WORDS = new Set([
    // Spanish
    ...["calle", "avenida", "avda", "plaza", "paseo", "camino", "carrera", "ronda"],
    // Italian
    ...["via", "viale", "piazza", "corso", "largo", "vicolo", "strada", "contrada"],
]);
/**
 * Dutch compound street suffixes. They are compared against the END of a token, not the whole
 * token (pending a `codex/nl` slice).
 */
const NL_STREET_SUFFIXES = [
    "straat",
    "laan",
    "plein",
    "gracht",
    "kade",
    "dijk",
    "steeg",
    "dreef",
];
|
|
143
|
+
/**
 * Decide whether a token denotes a street in any of the given address systems.
 *
 * The token is lowercased and stripped of every non-letter before it is tested; anything shorter
 * than two letters is rejected outright. Each vocabulary is consulted only when its tag is in
 * `systems`:
 *
 * - `us`: the suffix table in `@mailwoman/codex/us`, EXCEPT tokens that are also a US state
 *   abbreviation (`KY`: Key vs Kentucky, `PR`: Prairie vs Puerto Rico), because those sit in the
 *   postcode's own `City, ST ZIP` segment.
 * - `de`: `isGermanStreetToken` from `@mailwoman/codex/de`.
 * - `fr`: `isFrenchStreetWord` from `@mailwoman/codex/fr`.
 * - `es` / `it`: the inline `NON_US_STREET_WORDS` set (whole-token match).
 * - `nl`: the inline `NL_STREET_SUFFIXES`, matched against the token's tail; the token must be
 *   longer than the suffix itself.
 *
 * Gating by `systems` is what keeps an unrelated locale's vocabulary from producing a collision
 * (German `-ring` vs English `spring`): that vocabulary is simply never asked.
 *
 * @param {string} token A whitespace-delimited token from the text.
 * @param {Set<string>} systems Lowercase system/locale tags (`us`, `de`, `fr`, `es`, `it`, `nl`).
 * @returns {boolean} True when the token reads as a street word in a gated system.
 */
function looksLikeStreetWord(token, systems) {
    const word = token.toLowerCase().replace(/[^\p{L}]/gu, "");
    if (word.length < 2) {
        return false;
    }
    // Evaluated left to right with short-circuiting, so a vocabulary is only queried when its
    // system is gated in and no earlier vocabulary has already matched.
    return Boolean(
        (systems.has("us") && isStreetSuffixToken(word) && !isUsStateAbbreviation(word)) ||
            (systems.has("de") && isGermanStreetToken(word)) ||
            (systems.has("fr") && isFrenchStreetWord(word)) ||
            ((systems.has("es") || systems.has("it")) && NON_US_STREET_WORDS.has(word)) ||
            (systems.has("nl") &&
                NL_STREET_SUFFIXES.some((suffix) => word.length > suffix.length && word.endsWith(suffix))),
    );
}
|
|
176
|
+
/**
 * Position-aware confidence factor for a postcode span.
 *
 * Only digit-only codes can be mistaken for a house number, so anything else gets `1`. For a
 * digit-only code, the comma-delimited segment that contains the span is split on whitespace;
 * if any token in it reads as a street word the factor is `HOUSE_NUMBER_PENALTY`, otherwise `1`.
 * This structural prior lets the anchor tell a leading `12345 Main St` house number from a
 * trailing `San Francisco 94105` postcode with no model in the loop, and lets a consumer pick
 * the right span by confidence instead of by raw position.
 *
 * @param {string} text The full raw text.
 * @param {number} start Character offset at which the span starts in `text`.
 * @param {string} normalized The span's normalized key.
 * @param {Set<string>} systems Street vocabularies to consult: the code's gazetteer membership,
 *   or the format-shape candidates from codex for a code found in no gazetteer.
 * @returns {number} `1` or `HOUSE_NUMBER_PENALTY`.
 */
function positionFactor(text, start, normalized, systems) {
    const digitsOnly = /^\d+$/.test(normalized);
    if (!digitsOnly) {
        return 1;
    }
    // Segment = text between the nearest comma before the span and the nearest comma at or
    // after its start (or the ends of the text when there is no such comma).
    const previousComma = text.lastIndexOf(",", start - 1);
    const nextComma = text.indexOf(",", start);
    const segment = text.slice(previousComma + 1, nextComma < 0 ? text.length : nextComma);
    const sharesSegmentWithStreet = segment
        .split(/\s+/)
        .some((token) => looksLikeStreetWord(token, systems));
    return sharesSegmentWithStreet ? HOUSE_NUMBER_PENALTY : 1;
}
|
|
200
|
+
/**
 * Extract postcode anchors from raw text.
 *
 * Every postcode-shaped span reported by `collectMatches` is resolved against the gazetteer and
 * emitted as a soft anchor (country posterior + confidence). Resolution is a three-step ladder:
 * the exact normalized key; then, for a GB-shaped unit, its outward code; then, only when
 * `opts.fuzzy` is set, the edit-distance-1 variants of the key. A span that matches a shape but
 * resolves nowhere is still returned, with an empty posterior and confidence 0: an explicit
 * "looks like a postcode, but isn't one", so the caller can see the extractor fired and chose
 * not to anchor.
 *
 * @param {string} text The raw input text.
 * @param {{ lookup(postcode: string): Array<{ country: string, lat: number, lon: number }> }} resolver
 *   The gazetteer used for lookups.
 * @param {{ fuzzy?: boolean }} [opts] Optional switches; `fuzzy` enables the typo-tolerant retry.
 * @returns {Array<object>} One anchor per shaped span, in the order the spans were reported.
 */
export function extractPostcodeAnchors(text, resolver, opts = {}) {
    const anchors = [];
    for (const match of collectMatches(text)) {
        const spanText = text.slice(match.start, match.end);
        const normalized = normalizePostcode(spanText);
        // Exact first; then the GB outward fallback (structural, not a guess); then edit-distance-1.
        let hits = resolver.lookup(normalized);
        let matchType = hits.length > 0 ? "exact" : "none";
        if (matchType === "none") {
            const outward = gbOutwardCode(normalized);
            if (outward) {
                const outwardHits = resolver.lookup(outward);
                if (outwardHits.length > 0) {
                    hits = outwardHits;
                    matchType = "outward";
                }
            }
        }
        if (matchType === "none" && opts.fuzzy) {
            const fuzzyHits = [];
            for (const variant of editDistance1Variants(normalized)) {
                for (const h of resolver.lookup(variant))
                    fuzzyHits.push(h);
            }
            if (fuzzyHits.length > 0) {
                hits = fuzzyHits;
                matchType = "fuzzy";
            }
        }
        // Membership: distinct countries the postcode exists in (regardless of whether we have a
        // centroid). Sorted so the posterior's key order is deterministic.
        const countries = [...new Set(hits.map((h) => h.country))].sort();
        const k = countries.length;
        // Uniform posterior: never weighted by per-country postcode volume.
        const posterior = {};
        for (const c of countries)
            posterior[c] = 1 / k;
        // Placement: one representative coordinate-bearing hit per country (the first with real
        // coords). The "no centroid" sentinel is BOTH coordinates equal to 0; a hit with only one
        // zero coordinate lies on the equator or the prime meridian and is a real place, so it
        // must not be discarded.
        const candidates = [];
        for (const c of countries) {
            const placed = hits.find((h) => h.country === c && (h.lat !== 0 || h.lon !== 0));
            if (placed)
                candidates.push(placed);
        }
        // Gate the street-word check to the systems this code plausibly belongs to: its gazetteer
        // membership when known (precise: a US-only ZIP never checks the German vocab), else the
        // format-shape candidates from codex (for a code in no gazetteer; its confidence is 0 anyway).
        const systems = countries.length > 0
            ? new Set(countries.map((c) => c.toLowerCase()))
            : new Set(candidateSystemsForPostcode(normalized));
        const position = positionFactor(text, match.start, normalized, systems);
        // Ambiguity-based confidence, scaled down for typo-corrected matches and for digit-only
        // codes that sit beside a street word.
        const confidence = confidenceFromCountryCount(k) * (matchType === "fuzzy" ? FUZZY_PENALTY : 1) * position;
        anchors.push({
            span: { text: spanText, start: match.start, end: match.end },
            normalized,
            candidates,
            posterior,
            confidence,
            matchType,
            positionFactor: position,
        });
    }
    return anchors;
}
|
|
269
|
+
//# sourceMappingURL=postcode-anchor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"postcode-anchor.js","sourceRoot":"","sources":["../postcode-anchor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAEH,OAAO,EAAE,2BAA2B,EAAE,MAAM,kBAAkB,CAAA;AAC9D,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAA;AACzD,OAAO,EAAE,kBAAkB,EAAE,MAAM,qBAAqB,CAAA;AACxD,OAAO,EAAE,mBAAmB,EAAE,qBAAqB,EAAE,MAAM,qBAAqB,CAAA;AAChF,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAA;AA2DrD;;;GAGG;AACH,MAAM,aAAa,GAAG,EAAE,CAAA;AAExB,oGAAoG;AACpG,MAAM,aAAa,GAAG,GAAG,CAAA;AAEzB;;;;;GAKG;AACH,MAAM,UAAU,qBAAqB,CAAC,CAAS;IAC9C,MAAM,OAAO,GAAG,CAAC,EAAU,EAAU,EAAE,CACtC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,4BAA4B,CAAC,CAAC,CAAC,EAAE,CAAA;IACvF,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAA;IAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE;QAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA,CAAC,YAAY;IAC5F,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACnC,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,CAAC,CAAC,CAAC,CAAE,CAAC;YAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;gBAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA,CAAC,gBAAgB;IAClH,CAAC;IACD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAA,CAAC,aAAa;IAC5G,CAAC;IACD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE;QAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA,CAAC,iBAAiB;IACvH,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,CAAA;IAClB,OAAO,CAAC,GAAG,QAAQ,CAAC,CAAA;AACrB,CAAC;AA
ED;;;;GAIG;AACH,MAAM,UAAU,iBAAiB,CAAC,GAAW;IAC5C,IAAI,CAAC,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;IACrD,IAAI,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC;QAAE,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA,CAAC,0CAA0C;IAClF,IAAI,kBAAkB,CAAC,IAAI,CAAC,CAAC,CAAC;QAAE,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,CAAC,CAAA,CAAC,8CAA8C;IACrG,OAAO,CAAC,CAAA;AACT,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,aAAa,CAAC,UAAkB;IAC/C,MAAM,EAAE,GAAG,UAAU,CAAC,OAAO,CAAC,GAAG,CAAC,CAAA;IAClC,IAAI,EAAE,GAAG,CAAC;QAAE,OAAO,IAAI,CAAA;IACvB,OAAO,cAAc,CAAC,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAA;AACtF,CAAC;AAED;;;GAGG;AACH,SAAS,0BAA0B,CAAC,CAAS;IAC5C,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,CAAC,CAAA;IACpB,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IACrB,MAAM,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,CAAA;IACrD,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;AACnC,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,oBAAoB,GAAG,GAAG,CAAA;AAEhC;;;;GAIG;AACH,MAAM,mBAAmB,GAAG,IAAI,GAAG,CAAC;IACnC,UAAU;IACV,OAAO;IACP,SAAS;IACT,MAAM;IACN,OAAO;IACP,OAAO;IACP,QAAQ;IACR,SAAS;IACT,OAAO;IACP,UAAU;IACV,KAAK;IACL,OAAO;IACP,QAAQ;IACR,OAAO;IACP,OAAO;IACP,QAAQ;IACR,QAAQ;IACR,UAAU;CACV,CAAC,CAAA;AAEF,oGAAoG;AACpG,MAAM,kBAAkB,GAAG,CAAC,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,CAAC,CAAA;AAElG;;;;;;;;;;;;;;;;GAgBG;AACH,SAAS,mBAAmB,CAAC,KAAa,EAAE,OAA4B;IACvE,MAAM,CAAC,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,CAAA;IACvD,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,KAAK,CAAA;IAC9B,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,mBAAmB,CAAC,CAAC,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC,CAAC;QAAE,OAAO,IAAI,CAAA;IACzF,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,mBAAmB,CAAC,CAAC,CAAC;QAAE,OAAO,IAAI,CAAA;IAC5D,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,kBAAkB,CAAC,CAAC,CAAC;QAAE,OAAO,IAAI,CAAA;IAC3D,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAA
C,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,IAAI,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC;QAAE,OAAO,IAAI,CAAA;IACvF,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC;QAAE,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAA;IAClG,OAAO,KAAK,CAAA;AACb,CAAC;AAED;;;;;;;;;;GAUG;AACH,SAAS,cAAc,CAAC,IAAY,EAAE,KAAa,EAAE,UAAkB,EAAE,OAA4B;IACpG,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC;QAAE,OAAO,CAAC,CAAA,CAAC,mDAAmD;IAC3F,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,EAAE,KAAK,GAAG,CAAC,CAAC,GAAG,CAAC,CAAA;IACrD,IAAI,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,KAAK,CAAC,CAAA;IACrC,IAAI,MAAM,GAAG,CAAC;QAAE,MAAM,GAAG,IAAI,CAAC,MAAM,CAAA;IACpC,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;QAC/D,IAAI,mBAAmB,CAAC,KAAK,EAAE,OAAO,CAAC;YAAE,OAAO,oBAAoB,CAAA;IACrE,CAAC;IACD,OAAO,CAAC,CAAA;AACT,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,sBAAsB,CACrC,IAAY,EACZ,QAA0B,EAC1B,OAAmC,EAAE;IAErC,MAAM,OAAO,GAAqB,EAAE,CAAA;IAEpC,KAAK,MAAM,KAAK,IAAI,cAAc,CAAC,IAAI,CAAC,EAAE,CAAC;QAC1C,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,KAAK,CAAC,GAAG,CAAC,CAAA;QACnD,MAAM,UAAU,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAA;QAE9C,6FAA6F;QAC7F,IAAI,IAAI,GAAG,QAAQ,CAAC,MAAM,CAAC,UAAU,CAAC,CAAA;QACtC,IAAI,SAAS,GAAgC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAA;QAC/E,IAAI,SAAS,KAAK,MAAM,EAAE,CAAC;YAC1B,MAAM,OAAO,GAAG,aAAa,CAAC,UAAU,CAAC,CAAA;YACzC,IAAI,OAAO,EAAE,CAAC;gBACb,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;gBAC5C,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC5B,IAAI,GAAG,WAAW,CAAA;oBAClB,SAAS,GAAG,SAAS,CAAA;gBACtB,CAAC;YACF,CAAC;QACF,CAAC;QACD,IAAI,SAAS,KAAK,MAAM,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YACxC,MAAM,SAAS,GAAoB,EAAE,CAAA;YACrC,KAAK,MAAM,OAAO,IAAI,qBAAqB,CAAC,UAAU,CAAC,EAAE,CAAC;gBACzD,KAAK,MAAM,CAAC,IAAI,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC;oBAAE,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YAC5D,CAAC;YACD,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1B,IAAI,GAAG,SAAS,CAAA;gBAChB,SAAS,GAAG,OAAO,CAAA;YACpB,CAA
C;QACF,CAAC;QAED,oGAAoG;QACpG,MAAM,SAAS,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;QACjE,MAAM,CAAC,GAAG,SAAS,CAAC,MAAM,CAAA;QAE1B,MAAM,SAAS,GAA2B,EAAE,CAAA;QAC5C,KAAK,MAAM,CAAC,IAAI,SAAS;YAAE,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;QAE/C,iGAAiG;QACjG,MAAM,UAAU,GAAoB,EAAE,CAAA;QACtC,KAAK,MAAM,CAAC,IAAI,SAAS,EAAE,CAAC;YAC3B,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,KAAK,CAAC,CAAC,CAAA;YAC9E,IAAI,MAAM;gBAAE,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACpC,CAAC;QAED,0FAA0F;QAC1F,0FAA0F;QAC1F,+FAA+F;QAC/F,MAAM,OAAO,GACZ,SAAS,CAAC,MAAM,GAAG,CAAC;YACnB,CAAC,CAAC,IAAI,GAAG,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;YAChD,CAAC,CAAC,IAAI,GAAG,CAAS,2BAA2B,CAAC,UAAU,CAAC,CAAC,CAAA;QAC5D,MAAM,QAAQ,GAAG,cAAc,CAAC,IAAI,EAAE,KAAK,CAAC,KAAK,EAAE,UAAU,EAAE,OAAO,CAAC,CAAA;QACvE,MAAM,UAAU,GAAG,0BAA0B,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,KAAK,OAAO,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,QAAQ,CAAA;QAEzG,OAAO,CAAC,IAAI,CAAC;YACZ,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,KAAK,EAAE,KAAK,CAAC,KAAK,EAAE,GAAG,EAAE,KAAK,CAAC,GAAG,EAAE;YAC5D,UAAU;YACV,UAAU;YACV,SAAS;YACT,UAAU;YACV,SAAS;YACT,cAAc,EAAE,QAAQ;SACxB,CAAC,CAAA;IACH,CAAC;IAED,OAAO,OAAO,CAAA;AACf,CAAC"}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Browser-side postcode resolver for the anchor (#240). A pure-JS, zero-dependency
|
|
7
|
+
* `PostcodeResolver` backed by a compact flat binary instead of SQLite, so the postcode anchor
|
|
8
|
+
* runs in the WASM/browser parser behind the same `lookup()` seam as the server-side
|
|
9
|
+
* `WofPostcodeLookup`.
|
|
10
|
+
*
|
|
11
|
+
* This file owns BOTH ends of the format — `serializePostcodeBinary` (run in Node by
|
|
12
|
+
* `scripts/build-postcode-binary.ts`) and `PostcodeBinaryResolver` (run in the browser) — so the
|
|
13
|
+
* layout can never drift between writer and reader.
|
|
14
|
+
*
|
|
15
|
+
* Binary layout (little-endian): magic "PCB1" (4 bytes); u32 recordCount; u8 countryCount, then
|
|
16
|
+
* countryCount × 2 ASCII bytes (the country table); u8 keyWidth (max postcode length in bytes);
|
|
17
|
+
* records: recordCount × { key[keyWidth] ASCII right-padded with 0x00, u8 countryIdx, i16 latQ,
|
|
18
|
+
* i16 lonQ }, sorted by key bytes ascending. A postcode present in two countries appears as two
|
|
19
|
+
* adjacent records (same key, different countryIdx).
|
|
20
|
+
*
|
|
21
|
+
* Coordinates are quantized to i16: latQ = round(lat/90 × 32767), lonQ = round(lon/180 × 32767),
|
|
22
|
+
* giving ~300 m resolution — ample for a "which city/region" anchor. A record with latQ = lonQ =
|
|
23
|
+
* 0 means "known postcode, no centroid" (membership only), matching the SQLite resolver's
|
|
24
|
+
* convention.
|
|
25
|
+
*/
|
|
26
|
+
import type { AnchorLookup } from "./anchor-inference.js";
|
|
27
|
+
import type { PostcodePlace } from "./postcode-anchor.js";
|
|
28
|
+
export interface PostcodeBinaryEntry {
|
|
29
|
+
postcode: string;
|
|
30
|
+
country: string;
|
|
31
|
+
lat: number;
|
|
32
|
+
lon: number;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Serialize postcode entries into the flat binary. Entries are sorted by (postcode, country) so
|
|
36
|
+
* equal postcodes land in adjacent records. Run in Node; consumed by
|
|
37
|
+
* {@link PostcodeBinaryResolver}.
|
|
38
|
+
*/
|
|
39
|
+
export declare function serializePostcodeBinary(entries: readonly PostcodeBinaryEntry[]): Uint8Array;
|
|
40
|
+
/**
|
|
41
|
+
* Pure-JS, browser-safe postcode resolver over the flat binary. Implements the same `lookup()` seam
|
|
42
|
+
* as the SQLite `WofPostcodeLookup`, so `extractPostcodeAnchors` is agnostic to which backs it.
|
|
43
|
+
*/
|
|
44
|
+
export declare class PostcodeBinaryResolver {
|
|
45
|
+
#private;
|
|
46
|
+
constructor(bytes: Uint8Array);
|
|
47
|
+
lookup(postcode: string): PostcodePlace[];
|
|
48
|
+
/**
|
|
49
|
+
* Decode the whole binary into an {@link AnchorLookup} (`Map<postcode, AnchorEntry>`) for the
|
|
50
|
+
* neural anchor channel (#239/#240): each postcode → a uniform posterior over its member
|
|
51
|
+
* countries plus the mean of its non-zero centroids.
|
|
52
|
+
*
|
|
53
|
+
* This is the browser-side equivalent of the pilot
|
|
54
|
+
* postcode→anchor lookup the model trained against, built live from the shipped binary instead
|
|
55
|
+
* of a precomputed JSON. Records are stored sorted by (postcode, country), so equal keys are
|
|
56
|
+
* contiguous.
|
|
57
|
+
*/
|
|
58
|
+
toAnchorLookup(): AnchorLookup;
|
|
59
|
+
}
|
|
60
|
+
//# sourceMappingURL=postcode-binary-resolver.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"postcode-binary-resolver.d.ts","sourceRoot":"","sources":["../postcode-binary-resolver.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAA;AACzD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAA;AAOzD,MAAM,WAAW,mBAAmB;IACnC,QAAQ,EAAE,MAAM,CAAA;IAChB,OAAO,EAAE,MAAM,CAAA;IACf,GAAG,EAAE,MAAM,CAAA;IACX,GAAG,EAAE,MAAM,CAAA;CACX;AAUD;;;;GAIG;AACH,wBAAgB,uBAAuB,CAAC,OAAO,EAAE,SAAS,mBAAmB,EAAE,GAAG,UAAU,CA2C3F;AAED;;;GAGG;AACH,qBAAa,sBAAsB;;gBAStB,KAAK,EAAE,UAAU;IA2B7B,MAAM,CAAC,QAAQ,EAAE,MAAM,GAAG,aAAa,EAAE;IA2BzC;;;;;;;;;OASG;IACH,cAAc,IAAI,YAAY;CA+C9B"}
|