@mailwoman/neural 2.2.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/anchor-inference.d.ts +57 -0
- package/out/anchor-inference.d.ts.map +1 -0
- package/out/anchor-inference.js +94 -0
- package/out/anchor-inference.js.map +1 -0
- package/out/browser.d.ts +2 -0
- package/out/browser.d.ts.map +1 -1
- package/out/browser.js +4 -0
- package/out/browser.js.map +1 -1
- package/out/classifier.d.ts +62 -2
- package/out/classifier.d.ts.map +1 -1
- package/out/classifier.js +78 -17
- package/out/classifier.js.map +1 -1
- package/out/fst-prior.d.ts +71 -0
- package/out/fst-prior.d.ts.map +1 -0
- package/out/fst-prior.js +173 -0
- package/out/fst-prior.js.map +1 -0
- package/out/index.d.ts +3 -0
- package/out/index.d.ts.map +1 -1
- package/out/index.js +3 -0
- package/out/index.js.map +1 -1
- package/out/labels.d.ts +3 -0
- package/out/labels.d.ts.map +1 -1
- package/out/labels.js +13 -0
- package/out/labels.js.map +1 -1
- package/out/onnx-runner.d.ts +8 -1
- package/out/onnx-runner.d.ts.map +1 -1
- package/out/onnx-runner.js +31 -1
- package/out/onnx-runner.js.map +1 -1
- package/out/postcode-anchor.d.ts +117 -0
- package/out/postcode-anchor.d.ts.map +1 -0
- package/out/postcode-anchor.js +269 -0
- package/out/postcode-anchor.js.map +1 -0
- package/out/postcode-binary-resolver.d.ts +60 -0
- package/out/postcode-binary-resolver.d.ts.map +1 -0
- package/out/postcode-binary-resolver.js +208 -0
- package/out/postcode-binary-resolver.js.map +1 -0
- package/out/postcode-repair.d.ts +65 -0
- package/out/postcode-repair.d.ts.map +1 -0
- package/out/postcode-repair.js +171 -0
- package/out/postcode-repair.js.map +1 -0
- package/out/proposal-classifier.d.ts.map +1 -1
- package/out/proposal-classifier.js +3 -1
- package/out/proposal-classifier.js.map +1 -1
- package/out/query-shape-prior.d.ts +12 -0
- package/out/query-shape-prior.d.ts.map +1 -1
- package/out/query-shape-prior.js +132 -2
- package/out/query-shape-prior.js.map +1 -1
- package/out/street-morphology-prior.d.ts +56 -0
- package/out/street-morphology-prior.d.ts.map +1 -0
- package/out/street-morphology-prior.js +159 -0
- package/out/street-morphology-prior.js.map +1 -0
- package/out/unit-repair.d.ts +46 -0
- package/out/unit-repair.d.ts.map +1 -0
- package/out/unit-repair.js +147 -0
- package/out/unit-repair.js.map +1 -0
- package/out/vitest.config.d.ts.map +1 -1
- package/out/vitest.config.js +3 -0
- package/out/vitest.config.js.map +1 -1
- package/out/weights.d.ts +27 -3
- package/out/weights.d.ts.map +1 -1
- package/out/weights.js +46 -2
- package/out/weights.js.map +1 -1
- package/package.json +6 -2
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Postcode regex repair pass — v0.7 task #35 ("postcode regex pre-pass").
|
|
7
|
+
*
|
|
8
|
+
* The 2026-05-29 postcode diagnostic showed the neural model fragments alphanumeric postcodes at
|
|
9
|
+
* the SentencePiece layer (GB/CA/NL at 0%, US 80.5%, FR 70.1%). Three failure modes were visible
|
|
10
|
+
* in the data:
|
|
11
|
+
*
|
|
12
|
+
* 1. Total miss — "London SW1A 1AA" → (no postcode label)
|
|
13
|
+
* 2. Truncation — "M5V 2T6" → "2T6"; "B12 8QX" → "B12"
|
|
14
|
+
* 3. Char-drift — "75008" → "5008"; "62701" → "2701" (and smear: "1200-030 Lisboa" → "200-030 Lis")
|
|
15
|
+
*
|
|
16
|
+
* This pass runs AFTER the model's per-token BIO labels are decoded but BEFORE `buildAddressTree`.
|
|
17
|
+
* It detects postcode-shaped substrings with per-country regexes and repairs the label sequence
|
|
18
|
+
* so the postcode span matches the detected shape. The model is untouched — this is a
|
|
19
|
+
* deterministic decoder-side correction, the "lowest risk" lever in the v0.7 plan (vs. #36's soft
|
|
20
|
+
* FST shallow-fusion or #41's char-level encoder).
|
|
21
|
+
*
|
|
22
|
+
* PRECISION GUARDS (so we never regress the countries already passing):
|
|
23
|
+
*
|
|
24
|
+
* - Alphanumeric shapes (GB/CA/NL/DE-prefixed) are high-confidence "this IS a postcode" patterns →
|
|
25
|
+
* eligible to ADD a span where the model emitted none, but only over non-structural labels
|
|
26
|
+
* (never over house_number/street/etc.).
|
|
27
|
+
* - Numeric shapes (\d{5}, ZIP+4, JP, PT, PL) are ambiguous (a bare 5-digit could be a house number)
|
|
28
|
+
* → SNAP-only: they expand/clip an EXISTING postcode span, never create one from scratch.
|
|
29
|
+
* - Smear cleanup is LOCAL: only postcode tokens immediately flanking a snapped span are cleared. We
|
|
30
|
+
* never globally clear unmatched postcode tokens — that would regress shapes we don't
|
|
31
|
+
* pattern-match (AU 4-digit, IN 6-digit, …).
|
|
32
|
+
*/
|
|
33
|
+
/**
 * Per-country postcode shape patterns, ordered most-specific → least. Alphanumeric patterns require
 * uppercase letters (postcodes are conventionally uppercase, and the eval data has them uppercase)
 * — this keeps them from matching ordinary lowercase prose.
 *
 * Each entry carries:
 *   - `label`: a short identifier for the shape (not a BIO label),
 *   - `kind`:  "alnum" (eligible to ADD a span) or "numeric" (SNAP-only),
 *   - `re`:    a global regex; `collectMatches` resets `lastIndex` before every scan.
 */
export const POSTCODE_PATTERNS = [
    // --- Alphanumeric (eligible to ADD) ---
    // GB: outward + space + inward, e.g. SW1A 1AA, EH8 9YL, W1J 9PN, IP13 6SU, B12 8QX
    { label: "GB", kind: "alnum", re: /\b[A-Z]{1,2}\d[A-Z\d]?\s+\d[A-Z]{2}\b/g },
    // CA: A1A 1A1 (space optional), e.g. M5V 2T6, H2X 2T6, H3B 1A3
    { label: "CA", kind: "alnum", re: /\b[A-Z]\d[A-Z]\s?\d[A-Z]\d\b/g },
    // DE-prefixed: D-68161
    { label: "DE", kind: "alnum", re: /\bD-\d{5}\b/g },
    // NL: 1234 AB / 1234AB — space optional (glued is common). The US "2737 CA" (ZIP+4 tail +
    // state) collision is resolved by longest-match-wins below, which lets the ZIP+4 claim it.
    { label: "NL", kind: "alnum", re: /\b\d{4}\s?[A-Z]{2}\b/g },
    // --- Numeric (SNAP-only) ---
    { label: "ZIP4", kind: "numeric", re: /\b\d{5}-\d{4}\b/g }, // US ZIP+4
    { label: "JP", kind: "numeric", re: /\b\d{3}-\d{4}\b/g }, // 100-0001
    { label: "PT", kind: "numeric", re: /\b\d{4}-\d{3}\b/g }, // 3060-187
    { label: "PL", kind: "numeric", re: /\b\d{2}-\d{3}\b/g }, // 47-400
    { label: "NUM5", kind: "numeric", re: /\b\d{5}\b/g }, // US/FR/DE/ES 5-digit
];

/**
 * Labels a postcode span is allowed to overwrite when the model emitted no postcode at all (ADD
 * path). These are the geographic-container tags postcodes get confused with per the diagnostic
 * ("often labeled as locality or O"). Structural tags (house_number, street*, unit, po_box, venue,
 * …) are intentionally absent so we never clobber a confidently-labeled street/number with a false
 * postcode.
 */
const ADD_OVER_TAGS = new Set(["locality", "dependent_locality", "region", "subregion", "country"]);

const POSTCODE_B = "B-postcode";
const POSTCODE_I = "I-postcode";
const LOCALITY_B = "B-locality";
const LOCALITY_I = "I-locality";
const OUTSIDE = "O";

/** True when `label` is the begin or inside tag of a postcode span. */
function isPostcodeLabel(label) {
    return label === POSTCODE_B || label === POSTCODE_I;
}

/** Extract the bare tag from a BIO label ("B-locality" → "locality", "O" → null). */
function tagOf(label) {
    return label === OUTSIDE ? null : label.slice(2);
}

/**
 * Collect non-overlapping postcode matches, preferring longer matches and, on equal length, the
 * more-specific (earlier) pattern.
 *
 * @param {string} text Raw input text to scan.
 * @returns {Array<{start: number, end: number, kind: string, priority: number}>} Accepted matches
 *   as half-open character ranges `[start, end)`. `priority` is the index of the matching entry in
 *   `POSTCODE_PATTERNS`. The array is in acceptance order (longest first), NOT text order.
 */
export function collectMatches(text) {
    const candidates = [];
    POSTCODE_PATTERNS.forEach((pat, priority) => {
        // The regexes are shared, stateful /g objects — always rescan from the start.
        pat.re.lastIndex = 0;
        for (let m = pat.re.exec(text); m; m = pat.re.exec(text)) {
            candidates.push({ start: m.index, end: m.index + m[0].length, kind: pat.kind, priority });
        }
    });
    // Greedy longest-match-wins: accept by (length desc, then priority asc); reject anything
    // overlapping an accepted match. Longest-first lets a US ZIP+4 ("94610-2737") claim its span
    // before the shorter NL-shaped false positive in its tail ("2737 CA") can.
    candidates.sort((a, b) => b.end - b.start - (a.end - a.start) || a.priority - b.priority);
    const accepted = [];
    for (const c of candidates) {
        if (accepted.some((a) => c.start < a.end && a.start < c.end))
            continue;
        accepted.push(c);
    }
    return accepted;
}

/**
 * Repair postcode label spans in a decoded token sequence using per-country regexes. Returns a NEW
 * token array (inputs are not mutated) plus a change count.
 *
 * @param {string} text Raw input text the token offsets refer to.
 * @param {Array<{start: number, end: number, label: string}>} input Decoded tokens with half-open
 *   character offsets and BIO labels. Any extra properties are shallow-copied through.
 * @returns {{tokens: Array<object>, changed: number}} Shallow copies of the input tokens with
 *   repaired labels, and the number of label assignments that actually changed a value.
 */
export function repairPostcodeLabels(text, input) {
    const matches = collectMatches(text);
    const tokens = input.map((t) => ({ ...t }));
    if (matches.length === 0)
        return { tokens, changed: 0 };

    let changed = 0;
    const setLabel = (i, label) => {
        if (tokens[i].label !== label) {
            tokens[i].label = label;
            changed++;
        }
    };

    // Token indices this pass has already written as part of a snapped/added postcode span. The
    // smear scans below must not walk into them: without this barrier, two adjacent matches
    // ("M5V 2T6 SW1A 1AA") make the later-processed match mistake the earlier one's freshly
    // written span for model smear and erase it.
    const claimed = new Set();
    const isSmear = (i) => !claimed.has(i) && isPostcodeLabel(tokens[i].label);

    for (const m of matches) {
        // Tokens whose char span intersects the match.
        const overlap = [];
        for (let i = 0; i < tokens.length; i++) {
            const t = tokens[i];
            if (t.start < m.end && m.start < t.end)
                overlap.push(i);
        }
        if (overlap.length === 0)
            continue;

        const hasPostcode = overlap.some((i) => isPostcodeLabel(tokens[i].label));
        if (!hasPostcode) {
            // ADD path — only for high-confidence alphanumeric shapes, only over safe labels.
            if (m.kind !== "alnum")
                continue;
            const safe = overlap.every((i) => {
                const tag = tagOf(tokens[i].label);
                return tag === null || ADD_OVER_TAGS.has(tag);
            });
            if (!safe)
                continue;
        }

        // SNAP/ADD: relabel the matched run as a single postcode span.
        overlap.forEach((i, k) => {
            setLabel(i, k === 0 ? POSTCODE_B : POSTCODE_I);
            claimed.add(i);
        });

        // Leading smear clip: postcode tokens immediately BEFORE the snapped run are noise (e.g. a
        // house-number digit the model over-labeled) — clear to O as before.
        for (let j = overlap[0] - 1; j >= 0 && isSmear(j); j--)
            setLabel(j, OUTSIDE);

        // Trailing smear: the model over-extended the postcode to the RIGHT. In postcode-before-city
        // locales (DE/FR/ES/IT, "08523 Plauen") this swallows the leading characters of the city, which
        // the historical clip-to-O then DISCARDED ("08523 Pl|auen Vogtl" → postcode "08523" + O +
        // locality "auen Vogtl", dropping the "Pl"). When the smear connects to a following locality run,
        // hand those characters BACK to the city — reassign them to locality and demote the city's
        // leading B so the prefix + city form ONE span ("Pl"+"auen"+"Vogtl" → "Plauen Vogtl"). A
        // standalone neighbour with no following locality (a country, "Paris 75008 France") keeps the
        // historical clip-to-O. This is the decoder-side repair for the cross-tag postcode→city
        // absorption diagnosed in the PR3 Pilot A postmortem (+36pp DE exact-locality, no-op on US,
        // where the postcode sits at the end with nothing to trim).
        const trailing = [];
        for (let j = overlap[overlap.length - 1] + 1; j < tokens.length && isSmear(j); j++) {
            trailing.push(j);
        }
        if (trailing.length === 0)
            continue;

        const after = trailing[trailing.length - 1] + 1;
        const connectsToCity = after < tokens.length && tagOf(tokens[after].label) === "locality";
        if (connectsToCity) {
            trailing.forEach((j, k) => setLabel(j, k === 0 ? LOCALITY_B : LOCALITY_I));
            // The city's own B would start a second span; fold it into the one we just opened.
            if (tokens[after].label === LOCALITY_B)
                setLabel(after, LOCALITY_I);
        }
        else {
            for (const j of trailing)
                setLabel(j, OUTSIDE);
        }
    }
    return { tokens, changed };
}
|
|
171
|
+
//# sourceMappingURL=postcode-repair.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"postcode-repair.js","sourceRoot":"","sources":["../postcode-repair.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AAcH;;;;GAIG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAoE;IACjG,yCAAyC;IACzC,mFAAmF;IACnF,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,wCAAwC,EAAE;IAC5E,+DAA+D;IAC/D,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,+BAA+B,EAAE;IACnE,uBAAuB;IACvB,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,cAAc,EAAE;IAClD,0FAA0F;IAC1F,2FAA2F;IAC3F,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,uBAAuB,EAAE;IAC3D,8BAA8B;IAC9B,EAAE,KAAK,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,EAAE,EAAE,kBAAkB,EAAE,EAAE,WAAW;IACvE,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,EAAE,EAAE,kBAAkB,EAAE,EAAE,WAAW;IACrE,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,EAAE,EAAE,kBAAkB,EAAE,EAAE,WAAW;IACrE,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,EAAE,EAAE,kBAAkB,EAAE,EAAE,SAAS;IACnE,EAAE,KAAK,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,EAAE,sBAAsB;CAC5E,CAAA;AAED;;;;;;GAMG;AACH,MAAM,aAAa,GAAG,IAAI,GAAG,CAAS,CAAC,UAAU,EAAE,oBAAoB,EAAE,QAAQ,EAAE,WAAW,EAAE,SAAS,CAAC,CAAC,CAAA;AAE3G,MAAM,UAAU,GAAG,YAAqC,CAAA;AACxD,MAAM,UAAU,GAAG,YAAqC,CAAA;AACxD,MAAM,UAAU,GAAG,YAAqC,CAAA;AACxD,MAAM,UAAU,GAAG,YAAqC,CAAA;AACxD,MAAM,OAAO,GAAG,GAA4B,CAAA;AAE5C,SAAS,eAAe,CAAC,KAAa;IACrC,OAAO,KAAK,KAAK,YAAY,IAAI,KAAK,KAAK,YAAY,CAAA;AACxD,CAAC;AAED,qFAAqF;AACrF,SAAS,KAAK,CAAC,KAAa;IAC3B,OAAO,KAAK,KAAK,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;AAC7C,CAAC;AAED,6FAA6F;AAC7F,MAAM,UAAU,cAAc,CAAC,IAAY;IAC1C,MAAM,UAAU,GAAoB,EAAE,CAAA;IACtC,iBAAiB,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,QAAQ,EAAE,EAAE;QAC3C,GAAG,CAAC,EAAE,CAAC,SAAS,GAAG,CAAC,CAAA;QACpB,KAAK,IAAI,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YAC1D,UAAU,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,IAAI,EAAE,GAAG,CAAC,IAAI,EAAE,QAAQ
,EAAE,CAAC,CAAA;QAC1F,CAAC;IACF,CAAC,CAAC,CAAA;IACF,yFAAyF;IACzF,6FAA6F;IAC7F,2EAA2E;IAC3E,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAA;IACzF,MAAM,QAAQ,GAAoB,EAAE,CAAA;IACpC,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;QAC5B,IAAI,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAC;YAAE,SAAQ;QACtE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;IACD,OAAO,QAAQ,CAAA;AAChB,CAAC;AAQD;;;GAGG;AACH,MAAM,UAAU,oBAAoB,CAAC,IAAY,EAAE,KAA8B;IAChF,MAAM,OAAO,GAAG,cAAc,CAAC,IAAI,CAAC,CAAA;IACpC,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC,CAAA;IAC3C,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAC,EAAE,CAAA;IAEvD,IAAI,OAAO,GAAG,CAAC,CAAA;IACf,MAAM,QAAQ,GAAG,CAAC,CAAS,EAAE,KAA4B,EAAQ,EAAE;QAClE,IAAI,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,KAAK,KAAK,EAAE,CAAC;YAChC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,GAAG,KAAK,CAAA;YACxB,OAAO,EAAE,CAAA;QACV,CAAC;IACF,CAAC,CAAA;IAED,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACzB,+CAA+C;QAC/C,MAAM,OAAO,GAAa,EAAE,CAAA;QAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACpB,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG;gBAAE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACxD,CAAC;QACD,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;YAAE,SAAQ;QAElC,MAAM,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,CAAC,CAAA;QAC1E,IAAI,CAAC,WAAW,EAAE,CAAC;YAClB,kFAAkF;YAClF,IAAI,CAAC,CAAC,IAAI,KAAK,OAAO;gBAAE,SAAQ;YAChC,MAAM,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE;gBAChC,MAAM,GAAG,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,CAAA;gBACnC,OAAO,GAAG,KAAK,IAAI,IAAI,aAAa,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;YAC9C,CAAC,CAAC,CAAA;YACF,IAAI,CAAC,IAAI;gBAAE,SAAQ;QACpB,CAAC;QAED,+DAA+
D;QAC/D,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAA;QAEzE,2FAA2F;QAC3F,qEAAqE;QACrE,KAAK,IAAI,CAAC,GAAG,OAAO,CAAC,CAAC,CAAE,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,eAAe,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,EAAE,CAAC,EAAE;YAAE,QAAQ,CAAC,CAAC,EAAE,OAAO,CAAC,CAAA;QAEpG,6FAA6F;QAC7F,gGAAgG;QAChG,0FAA0F;QAC1F,kGAAkG;QAClG,2FAA2F;QAC3F,yFAAyF;QACzF,8FAA8F;QAC9F,wFAAwF;QACxF,4FAA4F;QAC5F,4DAA4D;QAC5D,MAAM,QAAQ,GAAa,EAAE,CAAA;QAC7B,KAAK,IAAI,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAE,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,IAAI,eAAe,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5G,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACjB,CAAC;QACD,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzB,MAAM,KAAK,GAAG,QAAQ,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAE,GAAG,CAAC,CAAA;YAChD,MAAM,cAAc,GAAG,KAAK,GAAG,MAAM,CAAC,MAAM,IAAI,KAAK,CAAC,MAAM,CAAC,KAAK,CAAE,CAAC,KAAK,CAAC,KAAK,UAAU,CAAA;YAC1F,IAAI,cAAc,EAAE,CAAC;gBACpB,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAA;gBAC1E,IAAI,MAAM,CAAC,KAAK,CAAE,CAAC,KAAK,KAAK,YAAY;oBAAE,QAAQ,CAAC,KAAK,EAAE,UAAU,CAAC,CAAA;YACvE,CAAC;iBAAM,CAAC;gBACP,KAAK,MAAM,CAAC,IAAI,QAAQ;oBAAE,QAAQ,CAAC,CAAC,EAAE,OAAO,CAAC,CAAA;YAC/C,CAAC;QACF,CAAC;IACF,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAA;AAC3B,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"proposal-classifier.d.ts","sourceRoot":"","sources":["../proposal-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAIH,OAAO,KAAK,EAGX,YAAY,EACZ,kBAAkB,EAElB,MAAM,uBAAuB,CAAA;AAC9B,OAAO,KAAK,EAAE,uBAAuB,EAAE,MAAM,iBAAiB,CAAA;AAG9D,MAAM,WAAW,8BAA8B;IAC9C,wFAAwF;IACxF,EAAE,EAAE,MAAM,CAAA;IACV,iDAAiD;IACjD,UAAU,EAAE,uBAAuB,CAAA;IACnC;;;;OAIG;IACH,KAAK,CAAC,EAAE,SAAS,YAAY,EAAE,CAAA;IAC/B,mFAAmF;IACnF,OAAO,CAAC,EAAE,SAAS,CAAC,MAAM,GAAG,GAAG,CAAC,EAAE,CAAA;IACnC,+DAA+D;IAC/D,OAAO,CAAC,EAAE,MAAM,CAAA;CAChB;AAED,0EAA0E;AAC1E,wBAAgB,8BAA8B,CAAC,GAAG,EAAE,8BAA8B,GAAG,kBAAkB,
|
|
1
|
+
{"version":3,"file":"proposal-classifier.d.ts","sourceRoot":"","sources":["../proposal-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAIH,OAAO,KAAK,EAGX,YAAY,EACZ,kBAAkB,EAElB,MAAM,uBAAuB,CAAA;AAC9B,OAAO,KAAK,EAAE,uBAAuB,EAAE,MAAM,iBAAiB,CAAA;AAG9D,MAAM,WAAW,8BAA8B;IAC9C,wFAAwF;IACxF,EAAE,EAAE,MAAM,CAAA;IACV,iDAAiD;IACjD,UAAU,EAAE,uBAAuB,CAAA;IACnC;;;;OAIG;IACH,KAAK,CAAC,EAAE,SAAS,YAAY,EAAE,CAAA;IAC/B,mFAAmF;IACnF,OAAO,CAAC,EAAE,SAAS,CAAC,MAAM,GAAG,GAAG,CAAC,EAAE,CAAA;IACnC,+DAA+D;IAC/D,OAAO,CAAC,EAAE,MAAM,CAAA;CAChB;AAED,0EAA0E;AAC1E,wBAAgB,8BAA8B,CAAC,GAAG,EAAE,8BAA8B,GAAG,kBAAkB,CA+CtG"}
|
|
@@ -24,7 +24,9 @@ export function createNeuralProposalClassifier(cfg) {
|
|
|
24
24
|
const emitsSet = new Set(emits);
|
|
25
25
|
const penalty = cfg.penalty ?? 0;
|
|
26
26
|
async function classify(section, _ctx) {
|
|
27
|
-
|
|
27
|
+
// Postcode regex repair on by default (v0.7 #35, operator-signed): +135/0 on the postcode
|
|
28
|
+
// harness, model-independent. Fixes the SentencePiece-fragmentation misses (GB/CA/NL/…).
|
|
29
|
+
const tree = await cfg.classifier.parse(section.body, { postcodeRepair: true });
|
|
28
30
|
const proposals = [];
|
|
29
31
|
const sectionOffset = section.start;
|
|
30
32
|
const visit = (node) => {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"proposal-classifier.js","sourceRoot":"","sources":["../proposal-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAYH,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAA;AAmBzC,0EAA0E;AAC1E,MAAM,UAAU,8BAA8B,CAAC,GAAmC;IACjF,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,IAAI,WAAW,CAAA;IACtC,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAe,KAAgC,CAAC,CAAA;IACxE,MAAM,OAAO,GAAG,GAAG,CAAC,OAAO,IAAI,CAAC,CAAA;IAEhC,KAAK,UAAU,QAAQ,CAAC,OAAgB,EAAE,IAAuB;QAChE,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAA;
|
|
1
|
+
{"version":3,"file":"proposal-classifier.js","sourceRoot":"","sources":["../proposal-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAYH,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAA;AAmBzC,0EAA0E;AAC1E,MAAM,UAAU,8BAA8B,CAAC,GAAmC;IACjF,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,IAAI,WAAW,CAAA;IACtC,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAe,KAAgC,CAAC,CAAA;IACxE,MAAM,OAAO,GAAG,GAAG,CAAC,OAAO,IAAI,CAAC,CAAA;IAEhC,KAAK,UAAU,QAAQ,CAAC,OAAgB,EAAE,IAAuB;QAChE,0FAA0F;QAC1F,yFAAyF;QACzF,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,cAAc,EAAE,IAAI,EAAE,CAAC,CAAA;QAC/E,MAAM,SAAS,GAA6B,EAAE,CAAA;QAC9C,MAAM,aAAa,GAAG,OAAO,CAAC,KAAK,CAAA;QAEnC,MAAM,KAAK,GAAG,CAAC,IAAiB,EAAQ,EAAE;YACzC,IAAI,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC5B,wFAAwF;gBACxF,oFAAoF;gBACpF,uFAAuF;gBACvF,iFAAiF;gBACjF,mFAAmF;gBACnF,2EAA2E;gBAC3E,MAAM,IAAI,GAAG;oBACZ,KAAK,EAAE,aAAa,GAAG,IAAI,CAAC,KAAK;oBACjC,GAAG,EAAE,aAAa,GAAG,IAAI,CAAC,GAAG;oBAC7B,IAAI,EAAE,IAAI,CAAC,KAAK;iBACG,CAAA;gBACpB,SAAS,CAAC,IAAI,CAAC;oBACd,IAAI;oBACJ,SAAS,EAAE,IAAI,CAAC,GAAG;oBACnB,UAAU,EAAE,IAAI,CAAC,UAAU;oBAC3B,MAAM,EAAE,QAAQ;oBAChB,SAAS,EAAE,GAAG,CAAC,EAAE;oBACjB,OAAO;iBACP,CAAC,CAAA;YACH,CAAC;YACD,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,QAAQ;gBAAE,KAAK,CAAC,KAAK,CAAC,CAAA;QAChD,CAAC,CAAA;QAED,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK;YAAE,KAAK,CAAC,IAAI,CAAC,CAAA;QAC1C,OAAO,SAAS,CAAA;IACjB,CAAC;IAED,OAAO;QACN,EAAE,EAAE,GAAG,CAAC,EAAE;QACV,KAAK;QACL,OAAO,EAAE,GAAG,CAAC,OAAO,IAAI,CAAC,GAAG,CAAC;QAC7B,QAAQ;KACR,CAAA;AACF,CAAC"}
|
|
@@ -24,6 +24,11 @@
|
|
|
24
24
|
*/
|
|
25
25
|
export interface QueryShapeLike {
|
|
26
26
|
knownFormats: ReadonlyArray<KnownFormatHitLike>;
|
|
27
|
+
regionAbbreviations?: ReadonlyArray<RegionAbbreviationHitLike>;
|
|
28
|
+
}
|
|
29
|
+
export interface RegionAbbreviationHitLike {
|
|
30
|
+
start: number;
|
|
31
|
+
span: string;
|
|
27
32
|
}
|
|
28
33
|
export interface KnownFormatHitLike {
|
|
29
34
|
format: string;
|
|
@@ -45,6 +50,13 @@ export interface BuildPriorsOpts {
|
|
|
45
50
|
* favored label. Confidence-scaled, so a 0.6-confidence format hit gets +0.6 max bias.
|
|
46
51
|
*/
|
|
47
52
|
biasScale?: number;
|
|
53
|
+
/**
|
|
54
|
+
* Bias magnitude for the locality soft prior (in log-odds units). Default 2.0 — adds ~e^2 ≈ 7.4×
|
|
55
|
+
* odds to B-locality / I-locality for tokens preceding a detected region abbreviation.
|
|
56
|
+
*/
|
|
57
|
+
localityBiasScale?: number;
|
|
58
|
+
/** Raw input text for region-name matching in the locality bias guard. */
|
|
59
|
+
inputText?: string;
|
|
48
60
|
}
|
|
49
61
|
/**
|
|
50
62
|
* Build a `[seqLen][numLabels]` matrix of additive log-bias to be added to encoder emissions before
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"query-shape-prior.d.ts","sourceRoot":"","sources":["../query-shape-prior.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAEH;;;GAGG;AACH,MAAM,WAAW,cAAc;IAC9B,YAAY,EAAE,aAAa,CAAC,kBAAkB,CAAC,CAAA;
|
|
1
|
+
{"version":3,"file":"query-shape-prior.d.ts","sourceRoot":"","sources":["../query-shape-prior.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAEH;;;GAGG;AACH,MAAM,WAAW,cAAc;IAC9B,YAAY,EAAE,aAAa,CAAC,kBAAkB,CAAC,CAAA;IAC/C,mBAAmB,CAAC,EAAE,aAAa,CAAC,yBAAyB,CAAC,CAAA;CAC9D;AAED,MAAM,WAAW,yBAAyB;IACzC,KAAK,EAAE,MAAM,CAAA;IACb,IAAI,EAAE,MAAM,CAAA;CACZ;AAED,MAAM,WAAW,kBAAkB;IAClC,MAAM,EAAE,MAAM,CAAA;IACd,IAAI,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE,CAAA;IACpC,4EAA4E;IAC5E,UAAU,EAAE,MAAM,CAAA;CAClB;AAED,+DAA+D;AAC/D,MAAM,WAAW,SAAS;IACzB,KAAK,EAAE,MAAM,CAAA;IACb,GAAG,EAAE,MAAM,CAAA;CACX;AAiBD,MAAM,WAAW,eAAe;IAC/B;;;OAGG;IACH,SAAS,CAAC,EAAE,MAAM,CAAA;IAElB;;;OAGG;IACH,iBAAiB,CAAC,EAAE,MAAM,CAAA;IAC1B,0EAA0E;IAC1E,SAAS,CAAC,EAAE,MAAM,CAAA;CAClB;AAED;;;;;;;;;GASG;AACH,wBAAgB,mBAAmB,CAClC,KAAK,EAAE,cAAc,EACrB,MAAM,EAAE,aAAa,CAAC,SAAS,CAAC,EAChC,MAAM,EAAE,aAAa,CAAC,MAAM,CAAC,EAC7B,IAAI,GAAE,eAAoB,GACxB,MAAM,EAAE,EAAE,CAoCZ;AA2ID,0EAA0E;AAC1E,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,EAAE,GAAG,MAAM,EAAE,EAAE,CAWvF"}
|
package/out/query-shape-prior.js
CHANGED
|
@@ -49,12 +49,13 @@ export function buildEmissionPriors(shape, tokens, labels, opts = {}) {
|
|
|
49
49
|
const matrix = [];
|
|
50
50
|
for (let t = 0; t < T; t++)
|
|
51
51
|
matrix.push(new Array(L).fill(0));
|
|
52
|
-
if (shape.knownFormats.length === 0)
|
|
53
|
-
return matrix;
|
|
54
52
|
// Index label → column for fast lookup.
|
|
55
53
|
const labelToCol = new Map();
|
|
56
54
|
for (let k = 0; k < labels.length; k++)
|
|
57
55
|
labelToCol.set(labels[k], k);
|
|
56
|
+
if (shape.knownFormats.length === 0 && (!shape.regionAbbreviations || shape.regionAbbreviations.length === 0)) {
|
|
57
|
+
return matrix;
|
|
58
|
+
}
|
|
58
59
|
for (const hit of shape.knownFormats) {
|
|
59
60
|
const targetLabel = FORMAT_TO_LABEL.get(hit.format);
|
|
60
61
|
if (!targetLabel)
|
|
@@ -70,8 +71,137 @@ export function buildEmissionPriors(shape, tokens, labels, opts = {}) {
|
|
|
70
71
|
}
|
|
71
72
|
}
|
|
72
73
|
}
|
|
74
|
+
// Locality soft prior: when a region abbreviation is detected (e.g., "DC", "NY"), bias
|
|
75
|
+
// preceding alphabetic tokens toward B-locality / I-locality. This counters the WOF
|
|
76
|
+
// bare-name frequency dominance that makes the model over-emit B-region on ambiguous
|
|
77
|
+
// place names like "Washington" or "New York".
|
|
78
|
+
applyLocalityBias(matrix, shape, tokens, labelToCol, opts.localityBiasScale ?? 2.0, opts.inputText);
|
|
73
79
|
return matrix;
|
|
74
80
|
}
|
|
81
|
+
/**
 * Apply locality bias to tokens preceding a detected region abbreviation.
 *
 * For "Washington, DC" — "DC" is the region abbreviation; "Washington" gets biased toward
 * B-locality. In a multi-token run the first token gets the B-locality column and every later
 * token the I-locality column (e.g. "Los Angeles, CA" → "Los" B, "Angeles" I).
 *
 * How the run is collected: tokens are walked right-to-left starting just before the
 * abbreviation's character offset. The first accepted token may sit up to 4 characters before the
 * abbreviation; each further token may sit up to 2 characters before the previously accepted one.
 * The walk stops at the first larger gap or at the first token overlapping a known-format span.
 *
 * Guard: when `inputText` is supplied and the collected run spells the full name of the region
 * the abbreviation stands for (e.g. "Washington" before "WA", or "New York" before "NY"), no bias
 * is applied — the text IS the region, not a locality within it. The comparison ignores case,
 * surrounding punctuation (so a trailing "," token does not defeat it) and repeated whitespace, and
 * the abbreviation itself is matched case-insensitively. Without `inputText` the guard is skipped.
 *
 * NOTE(review): the walk does not inspect token text, so digits or street words within the gap
 * limits are biased too; confirm whether an alphabetic-only filter was intended.
 *
 * @param {number[][]} matrix `[token][label]` additive bias matrix; mutated in place. A cell is
 *   only ever raised (`Math.max` with `localityBias`), never lowered.
 * @param {object} shape Query shape; reads `regionAbbreviations` (`{start, span}` hits) and
 *   `knownFormats` (hits with a `{start, end}` `span`).
 * @param {Array<{start: number, end: number}>} tokens Token character ranges.
 * @param {Map<string, number>} labelToCol Label → column index. Nothing happens when "B-locality"
 *   is absent; tokens needing "I-locality" are skipped when that column is absent.
 * @param {number} localityBias Bias value written into the chosen cells.
 * @param {string} [inputText] Raw input text, used only by the region-name guard.
 * @returns {void}
 */
function applyLocalityBias(matrix, shape, tokens, labelToCol, localityBias, inputText) {
    const abbrevs = shape.regionAbbreviations;
    if (!abbrevs || abbrevs.length === 0)
        return;
    const bLocCol = labelToCol.get("B-locality");
    const iLocCol = labelToCol.get("I-locality");
    if (bLocCol === undefined)
        return;

    // Lower-case, trim non-letter characters from both ends, and collapse whitespace runs so the
    // comparison against ABBREV_TO_REGION names is insensitive to punctuation tokens and spacing.
    const normalize = (s) => s
        .toLowerCase()
        .replace(/^[^\p{L}]+|[^\p{L}]+$/gu, "")
        .replace(/\s+/g, " ");

    for (const abbrev of abbrevs) {
        const candidates = [];
        let prevStart = abbrev.start;
        for (let t = tokens.length - 1; t >= 0; t--) {
            const tok = tokens[t];
            // Skip the abbreviation itself and anything after it.
            if (tok.end > abbrev.start)
                continue;
            const gap = prevStart - tok.end;
            if (candidates.length === 0 && gap > 4)
                break;
            if (candidates.length > 0 && gap > 2)
                break;
            // A token inside a recognised format span ends the run.
            if (shape.knownFormats.some((fmt) => overlaps(tok, fmt.span)))
                break;
            candidates.push(t);
            prevStart = tok.start;
        }
        if (candidates.length === 0)
            continue;
        candidates.reverse();

        if (inputText) {
            const firstTok = tokens[candidates[0]];
            const lastTok = tokens[candidates[candidates.length - 1]];
            const candidateText = normalize(inputText.slice(firstTok.start, lastTok.end));
            const regionNames = ABBREV_TO_REGION.get(abbrev.span?.toUpperCase());
            if (regionNames?.includes(candidateText))
                continue;
        }

        for (let i = 0; i < candidates.length; i++) {
            const t = candidates[i];
            const col = i === 0 ? bLocCol : iLocCol;
            if (col === undefined)
                continue;
            matrix[t][col] = Math.max(matrix[t][col], localityBias);
        }
    }
}

/** US state / territory abbreviation (upper-case) → lower-case full region name(s). */
const ABBREV_TO_REGION = new Map([
    ["AL", ["alabama"]],
    ["AK", ["alaska"]],
    ["AZ", ["arizona"]],
    ["AR", ["arkansas"]],
    ["CA", ["california"]],
    ["CO", ["colorado"]],
    ["CT", ["connecticut"]],
    ["DE", ["delaware"]],
    ["DC", ["district of columbia"]],
    ["FL", ["florida"]],
    ["GA", ["georgia"]],
    ["HI", ["hawaii"]],
    ["ID", ["idaho"]],
    ["IL", ["illinois"]],
    ["IN", ["indiana"]],
    ["IA", ["iowa"]],
    ["KS", ["kansas"]],
    ["KY", ["kentucky"]],
    ["LA", ["louisiana"]],
    ["ME", ["maine"]],
    ["MD", ["maryland"]],
    ["MA", ["massachusetts"]],
    ["MI", ["michigan"]],
    ["MN", ["minnesota"]],
    ["MS", ["mississippi"]],
    ["MO", ["missouri"]],
    ["MT", ["montana"]],
    ["NE", ["nebraska"]],
    ["NV", ["nevada"]],
    ["NH", ["new hampshire"]],
    ["NJ", ["new jersey"]],
    ["NM", ["new mexico"]],
    ["NY", ["new york"]],
    ["NC", ["north carolina"]],
    ["ND", ["north dakota"]],
    ["OH", ["ohio"]],
    ["OK", ["oklahoma"]],
    ["OR", ["oregon"]],
    ["PA", ["pennsylvania"]],
    ["RI", ["rhode island"]],
    ["SC", ["south carolina"]],
    ["SD", ["south dakota"]],
    ["TN", ["tennessee"]],
    ["TX", ["texas"]],
    ["UT", ["utah"]],
    ["VT", ["vermont"]],
    ["VA", ["virginia"]],
    ["WA", ["washington"]],
    ["WV", ["west virginia"]],
    ["WI", ["wisconsin"]],
    ["WY", ["wyoming"]],
    ["AS", ["american samoa"]],
    ["GU", ["guam"]],
    ["MP", ["northern mariana islands"]],
    ["PR", ["puerto rico"]],
    ["VI", ["virgin islands"]],
]);

/** True when the half-open ranges `a` and `b` (`{start, end}`) share at least one position. */
function overlaps(a, b) {
    return a.start < b.end && b.start < a.end;
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"query-shape-prior.js","sourceRoot":"","sources":["../query-shape-prior.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;
|
|
1
|
+
{"version":3,"file":"query-shape-prior.js","sourceRoot":"","sources":["../query-shape-prior.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AA6BH;;;GAGG;AACH,MAAM,eAAe,GAAgC,IAAI,GAAG,CAAC;IAC5D,CAAC,QAAQ,EAAE,YAAY,CAAC;IACxB,CAAC,SAAS,EAAE,YAAY,CAAC;IACzB,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,QAAQ,EAAE,UAAU,CAAC;CACtB,CAAC,CAAA;AAkBF;;;;;;;;;GASG;AACH,MAAM,UAAU,mBAAmB,CAClC,KAAqB,EACrB,MAAgC,EAChC,MAA6B,EAC7B,OAAwB,EAAE;IAE1B,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAA;IACvB,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAA;IACvB,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,GAAG,CAAA;IACvC,MAAM,MAAM,GAAe,EAAE,CAAA;IAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;QAAE,MAAM,CAAC,IAAI,CAAC,IAAI,KAAK,CAAS,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;IAErE,wCAAwC;IACxC,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAA;IAC5C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE;QAAE,UAAU,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,EAAE,CAAC,CAAC,CAAA;IAErE,IAAI,KAAK,CAAC,YAAY,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,mBAAmB,IAAI,KAAK,CAAC,mBAAmB,CAAC,MAAM,KAAK,CAAC,CAAC,EAAE,CAAC;QAC/G,OAAO,MAAM,CAAA;IACd,CAAC;IAED,KAAK,MAAM,GAAG,IAAI,KAAK,CAAC,YAAY,EAAE,CAAC;QACtC,MAAM,WAAW,GAAG,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;QACnD,IAAI,CAAC,WAAW;YAAE,SAAQ;QAC1B,MAAM,GAAG,GAAG,UAAU,CAAC,GAAG,CAAC,WAAW,CAAC,CAAA;QACvC,IAAI,GAAG,KAAK,SAAS;YAAE,SAAQ;QAC/B,MAAM,IAAI,GAAG,GAAG,CAAC,UAAU,GAAG,SAAS,CAAA;QACvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5B,MAAM,GAAG,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACtB,IAAI,QAAQ,CAAC,GAAG,EAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC7B,MAAM,CAAC,CAAC,CAAE,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,GAAG,CAAE,EAAE,IAAI,CAAC,CAAA;YACnD,CAAC;QACF,CAAC;IACF,CAAC;IAED,uFAAuF;IACvF,oFAAoF;IACpF,qFAAqF;IACrF,+CAA+C;IAC/C,iBAAiB,CAAC,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,EAAE,IAAI,CAAC,iBAAiB,IAAI,GAAG,EAAE,
IAAI,CAAC,SAAS,CAAC,CAAA;IAEnG,OAAO,MAAM,CAAA;AACd,CAAC;AAED;;;;;;;;;;;;;GAaG;AACH,SAAS,iBAAiB,CACzB,MAAkB,EAClB,KAAqB,EACrB,MAAqD,EACrD,UAA+B,EAC/B,YAAoB,EACpB,SAAkB;IAElB,MAAM,OAAO,GAAG,KAAK,CAAC,mBAAmB,CAAA;IACzC,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAM;IAE5C,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,YAAY,CAAC,CAAA;IAC5C,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,YAAY,CAAC,CAAA;IAC5C,IAAI,OAAO,KAAK,SAAS;QAAE,OAAM;IAEjC,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAa,EAAE,CAAA;QAC/B,IAAI,SAAS,GAAG,MAAM,CAAC,KAAK,CAAA;QAE5B,KAAK,IAAI,CAAC,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7C,MAAM,GAAG,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACtB,IAAI,GAAG,CAAC,GAAG,GAAG,MAAM,CAAC,KAAK;gBAAE,SAAQ;YAEpC,MAAM,GAAG,GAAG,SAAS,GAAG,GAAG,CAAC,GAAG,CAAA;YAC/B,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,IAAI,GAAG,GAAG,CAAC;gBAAE,MAAK;YAC7C,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,IAAI,GAAG,GAAG,CAAC;gBAAE,MAAK;YAE3C,IAAI,UAAU,GAAG,KAAK,CAAA;YACtB,KAAK,MAAM,GAAG,IAAI,KAAK,CAAC,YAAY,EAAE,CAAC;gBACtC,IAAI,QAAQ,CAAC,GAAG,EAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;oBAC7B,UAAU,GAAG,IAAI,CAAA;oBACjB,MAAK;gBACN,CAAC;YACF,CAAC;YACD,IAAI,UAAU;gBAAE,MAAK;YAErB,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YAClB,SAAS,GAAG,GAAG,CAAC,KAAK,CAAA;QACtB,CAAC;QAED,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;YAAE,SAAQ;QACrC,UAAU,CAAC,OAAO,EAAE,CAAA;QAEpB,IAAI,SAAS,EAAE,CAAC;YACf,MAAM,QAAQ,GAAG,MAAM,CAAC,UAAU,CAAC,CAAC,CAAE,CAAE,CAAA;YACxC,MAAM,OAAO,GAAG,MAAM,CAAC,UAAU,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,CAAE,CAAE,CAAA;YAC3D,MAAM,aAAa,GAAG,SAAS,CAAC,KAAK,CAAC,QAAQ,CAAC,KAAK,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,WAAW,EAAE,CAAA;YAChF,MAAM,WAAW,GAAG,gBAAgB,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;YACrD,IAAI,WAAW,EAAE,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,aAAa,KAAK,IAAI,CAAC;gBAAE,SAAQ;QAClE,CAAC;QAED,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5C,MAAM,CAAC,GAAG,UAAU,CAAC,CAAC,CAAE,CAAA;YACxB,MAAM,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAA;YACvC,IAAI,GAAG,KAAK,SAAS;gBAAE,SAAQ;YAC/B,M
AAM,CAAC,CAAC,CAAE,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,GAAG,CAAE,EAAE,YAAY,CAAC,CAAA;QAC3D,CAAC;IACF,CAAC;AACF,CAAC;AAED,MAAM,gBAAgB,GAAkC,IAAI,GAAG,CAAC;IAC/D,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,CAAC;IAClB,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,YAAY,CAAC,CAAC;IACtB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,aAAa,CAAC,CAAC;IACvB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,sBAAsB,CAAC,CAAC;IAChC,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,CAAC;IAClB,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,CAAC;IACjB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC,CAAC;IAChB,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,CAAC;IAClB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,WAAW,CAAC,CAAC;IACrB,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,CAAC;IACjB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,eAAe,CAAC,CAAC;IACzB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,WAAW,CAAC,CAAC;IACrB,CAAC,IAAI,EAAE,CAAC,aAAa,CAAC,CAAC;IACvB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,CAAC;IAClB,CAAC,IAAI,EAAE,CAAC,eAAe,CAAC,CAAC;IACzB,CAAC,IAAI,EAAE,CAAC,YAAY,CAAC,CAAC;IACtB,CAAC,IAAI,EAAE,CAAC,YAAY,CAAC,CAAC;IACtB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,gBAAgB,CAAC,CAAC;IAC1B,CAAC,IAAI,EAAE,CAAC,cAAc,CAAC,CAAC;IACxB,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC,CAAC;IAChB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,CAAC;IAClB,CAAC,IAAI,EAAE,CAAC,cAAc,CAAC,CAAC;IACxB,CAAC,IAAI,EAAE,CAAC,cAAc,CAAC,CAAC;IACxB,CAAC,IAAI,EAAE,CAAC,gBAAgB,CAAC,CAAC;IAC1B,CAAC,IAAI,EAAE,CAAC,cAAc,CAAC,CAAC;IACxB,CAAC,IAAI,EAAE,CAAC,WAAW,CAAC,CAAC;IACrB,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,CAAC;IACjB,C
AAC,IAAI,EAAE,CAAC,MAAM,CAAC,CAAC;IAChB,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,YAAY,CAAC,CAAC;IACtB,CAAC,IAAI,EAAE,CAAC,eAAe,CAAC,CAAC;IACzB,CAAC,IAAI,EAAE,CAAC,WAAW,CAAC,CAAC;IACrB,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,gBAAgB,CAAC,CAAC;IAC1B,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC,CAAC;IAChB,CAAC,IAAI,EAAE,CAAC,0BAA0B,CAAC,CAAC;IACpC,CAAC,IAAI,EAAE,CAAC,aAAa,CAAC,CAAC;IACvB,CAAC,IAAI,EAAE,CAAC,gBAAgB,CAAC,CAAC;CAC1B,CAAC,CAAA;AAEF,SAAS,QAAQ,CAAC,CAAiC,EAAE,CAAiC;IACrF,OAAO,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAA;AAC1C,CAAC;AAED,0EAA0E;AAC1E,MAAM,UAAU,iBAAiB,CAAC,SAAqB,EAAE,MAAkB;IAC1E,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,CAAA;IACnE,MAAM,GAAG,GAAe,EAAE,CAAA;IAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,MAAM,CAAC,GAAG,SAAS,CAAC,CAAC,CAAE,CAAA;QACvB,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,IAAI,KAAK,CAAS,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QAC1D,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,MAAM,CAAC,CAAA;QACvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAE,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAA;QAC/D,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACd,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC"}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Street-morphology emission bias — Layer 1 of the four-layer street-supplement architecture (see
|
|
7
|
+
* `docs/articles/concepts/street-supplement-architecture.md`).
|
|
8
|
+
*
|
|
9
|
+
* This module composes with {@linkcode buildFstEmissionPriors} (admin FST) and the QueryShape prior
|
|
10
|
+
* via {@linkcode addEmissionMatrix} — same shape, same additive semantics. Where the admin FST
|
|
11
|
+
* biases admin BIO labels (`B/I-locality`, `B/I-region`, ...), the morphology FST biases:
|
|
12
|
+
*
|
|
13
|
+
* - **Affix-token (the matched span):** toward `B/I-street_prefix` AND `B/I-street_suffix` (position
|
|
14
|
+
* unknown — let the model + context disambiguate).
|
|
15
|
+
* - **Adjacent token (one before AND one after each match):** toward `B/I-street`, AWAY from
|
|
16
|
+
* `B/I-dependent_locality`. The negative bias on `dependent_locality` is the load-bearing
|
|
17
|
+
* piece — it closes the inference-time vacuum that caused v0.6.1's 1066 dep_locality
|
|
18
|
+
* hallucinations (see [[project-v061-failure-mechanism]]).
|
|
19
|
+
*
|
|
20
|
+
* The morphology FST itself is built by `resolver-wof-sqlite/street-morphology-fst-builder.ts` and
|
|
21
|
+
* ships as a separate binary (`fst-street-morphology.bin`) loaded into a second `FstMatcher`
|
|
22
|
+
* instance.
|
|
23
|
+
*/
|
|
24
|
+
import { type FstMatcherLike } from "./fst-prior.js";
|
|
25
|
+
import type { TokenLike } from "./query-shape-prior.js";
|
|
26
|
+
/**
 * Optional magnitudes for {@linkcode buildStreetMorphologyEmissionPriors}. Every field may be
 * omitted; the defaults quoted below are the ones the implementation falls back to.
 */
export interface StreetMorphologyPriorOpts {
    /**
     * Multiplier applied to {@linkcode maxAffixBias} and {@linkcode maxNeighbourStreetBias} to obtain
     * the values actually written into the matrix. Default 1.0. It is NOT applied to
     * {@linkcode dependentLocalityPenalty}, which is used as given.
     */
    biasScale?: number;
    /**
     * Maximum bias magnitude (logits) on the affix span itself. Default 3.0 — same as the admin FST.
     * The morphology signal is structurally less ambiguous than admin names (`Avenue` is almost never
     * anything but street-typing), so equal magnitude is justified. The written value is
     * `biasScale * maxAffixBias`.
     */
    maxAffixBias?: number;
    /**
     * Maximum bias magnitude (logits) on the adjacent (neighbour) tokens for the `street` label.
     * Default 2.0 — a touch weaker than the affix bias because the neighbour is inferred from
     * adjacency, not direct match. The written value is `biasScale * maxNeighbourStreetBias`.
     */
    maxNeighbourStreetBias?: number;
    /**
     * Magnitude of the negative bias applied to `dependent_locality` BIO labels on the adjacent
     * tokens. Default 2.0. This is the load-bearing piece. Written as `-dependentLocalityPenalty`,
     * independent of {@linkcode biasScale}.
     */
    dependentLocalityPenalty?: number;
}
|
|
47
|
+
/**
 * Build a `[seqLen][numLabels]` bias matrix from street-morphology FST matches.
 *
 * The output composes with the admin FST bias matrix via {@linkcode addEmissionMatrix} — same
 * `addEmissionMatrix(emissions, fstBias) → biasedEmissions` pattern as the existing admin prior.
 *
 * @param fst - Matcher loaded with the street-morphology FST. The implementation calls its `walk`,
 *   `walkFrom` and `accepting` members.
 * @param pieces - Tokenised input; the result has one row per piece.
 * @param labels - Label vocabulary; the result has one column per label, in this order.
 * @param opts - Optional bias magnitudes, see {@linkcode StreetMorphologyPriorOpts}.
 * @returns A `pieces.length × labels.length` matrix of additive logit biases. It is all zeros when
 *   `labels` lacks any of `B-street`, `B-street_prefix` or `B-street_suffix`, or when no
 *   `street_affix` span was matched.
 */
export declare function buildStreetMorphologyEmissionPriors(fst: FstMatcherLike, pieces: ReadonlyArray<TokenLike & {
    piece: string;
}>, labels: ReadonlyArray<string>, opts?: StreetMorphologyPriorOpts): number[][];
|
|
56
|
+
//# sourceMappingURL=street-morphology-prior.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"street-morphology-prior.d.ts","sourceRoot":"","sources":["../street-morphology-prior.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,EAAwB,KAAK,cAAc,EAAkB,MAAM,gBAAgB,CAAA;AAC1F,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAA;AAEvD,MAAM,WAAW,yBAAyB;IACzC,sFAAsF;IACtF,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB;;;;OAIG;IACH,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB;;;;OAIG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAA;IAC/B;;;OAGG;IACH,wBAAwB,CAAC,EAAE,MAAM,CAAA;CACjC;AAED;;;;;GAKG;AACH,wBAAgB,mCAAmC,CAClD,GAAG,EAAE,cAAc,EACnB,MAAM,EAAE,aAAa,CAAC,SAAS,GAAG;IAAE,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,EAAE,aAAa,CAAC,MAAM,CAAC,EAC7B,IAAI,GAAE,yBAA8B,GAClC,MAAM,EAAE,EAAE,CAgIZ"}
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Street-morphology emission bias — Layer 1 of the four-layer street-supplement architecture (see
|
|
7
|
+
* `docs/articles/concepts/street-supplement-architecture.md`).
|
|
8
|
+
*
|
|
9
|
+
* This module composes with {@linkcode buildFstEmissionPriors} (admin FST) and the QueryShape prior
|
|
10
|
+
* via {@linkcode addEmissionMatrix} — same shape, same additive semantics. Where the admin FST
|
|
11
|
+
* biases admin BIO labels (`B/I-locality`, `B/I-region`, ...), the morphology FST biases:
|
|
12
|
+
*
|
|
13
|
+
* - **Affix-token (the matched span):** toward `B/I-street_prefix` AND `B/I-street_suffix` (position
|
|
14
|
+
* unknown — let the model + context disambiguate).
|
|
15
|
+
* - **Adjacent token (one before AND one after each match):** toward `B/I-street`, AWAY from
|
|
16
|
+
* `B/I-dependent_locality`. The negative bias on `dependent_locality` is the load-bearing
|
|
17
|
+
* piece — it closes the inference-time vacuum that caused v0.6.1's 1066 dep_locality
|
|
18
|
+
* hallucinations (see [[project-v061-failure-mechanism]]).
|
|
19
|
+
*
|
|
20
|
+
* The morphology FST itself is built by `resolver-wof-sqlite/street-morphology-fst-builder.ts` and
|
|
21
|
+
* ships as a separate binary (`fst-street-morphology.bin`) loaded into a second `FstMatcher`
|
|
22
|
+
* instance.
|
|
23
|
+
*/
|
|
24
|
+
import { groupPiecesIntoWords } from "./fst-prior.js";
|
|
25
|
+
/**
 * Build a `[seqLen][numLabels]` bias matrix from street-morphology FST matches.
 *
 * The output composes with the admin FST bias matrix via {@linkcode addEmissionMatrix} — same
 * `addEmissionMatrix(emissions, fstBias) → biasedEmissions` pattern as the existing admin prior.
 *
 * Two passes run over the word groups returned by `groupPiecesIntoWords`:
 *
 * 1. From every non-empty start group the FST is walked as far as it will go and the longest
 *    accepting span is kept. When that state carries a `street_affix` entry, each piece of the span
 *    is raised to `biasScale * maxAffixBias` on BOTH the street_prefix and street_suffix columns.
 * 2. The nearest non-empty group on either side of each matched span is raised to
 *    `biasScale * maxNeighbourStreetBias` on the street columns and lowered to
 *    `-dependentLocalityPenalty` on the dependent_locality columns (if the vocabulary has them).
 *
 * The first piece of a span or neighbour uses the `B-` column, later pieces the `I-` column (or
 * `B-` when the vocabulary has no `I-` label). Cells are merged with `Math.max` / `Math.min`, so
 * overlapping matches do not stack.
 *
 * @param fst - Morphology matcher; only `walk`, `walkFrom` and `accepting` are called.
 * @param pieces - Tokenised input, one matrix row per piece.
 * @param labels - Label vocabulary, one matrix column per label.
 * @param opts - Optional magnitudes (`biasScale` 1.0, `maxAffixBias` 3.0,
 *   `maxNeighbourStreetBias` 2.0, `dependentLocalityPenalty` 2.0 by default).
 * @returns The bias matrix; all zeros when `labels` lacks `B-street`, `B-street_prefix` or
 *   `B-street_suffix`, or when nothing matched.
 */
export function buildStreetMorphologyEmissionPriors(fst, pieces, labels, opts = {}) {
    const scale = opts.biasScale ?? 1.0;
    const affixBias = scale * (opts.maxAffixBias ?? 3.0);
    const neighbourStreetBias = scale * (opts.maxNeighbourStreetBias ?? 2.0);
    // The penalty is deliberately left unscaled: it is written as a fixed negative logit.
    const depLocBias = -(opts.dependentLocalityPenalty ?? 2.0);

    const matrix = Array.from({ length: pieces.length }, () => new Array(labels.length).fill(0));

    const columnOf = new Map();
    for (const [column, label] of labels.entries()) {
        columnOf.set(label, column);
    }
    const [
        bStreetPrefix,
        iStreetPrefix,
        bStreetSuffix,
        iStreetSuffix,
        bStreet,
        iStreet,
        bDepLoc,
        iDepLoc,
    ] = [
        "B-street_prefix",
        "I-street_prefix",
        "B-street_suffix",
        "I-street_suffix",
        "B-street",
        "I-street",
        "B-dependent_locality",
        "I-dependent_locality",
    ].map((label) => columnOf.get(label));

    // A vocabulary without street tags (e.g. a Stage 1 model) gives us nothing to bias toward, so
    // hand back the zero matrix and let the additive pipeline no-op.
    const hasStreetVocabulary = bStreet !== undefined && bStreetPrefix !== undefined && bStreetSuffix !== undefined;
    if (!hasStreetVocabulary) {
        return matrix;
    }

    const wordGroups = groupPiecesIntoWords(pieces);
    if (wordGroups.length === 0) {
        return matrix;
    }

    const raise = (row, column, value) => {
        matrix[row][column] = Math.max(matrix[row][column], value);
    };
    const lower = (row, column, value) => {
        matrix[row][column] = Math.min(matrix[row][column], value);
    };

    // Pass 1 — find the longest accepted span per start group and bias its pieces as affixes.
    const matchedSpans = [];
    for (const [start, group] of wordGroups.entries()) {
        if (group.fstToken === "") {
            continue;
        }
        const span = longestAcceptedSpan(fst, wordGroups, start);
        if (span === null) {
            continue;
        }
        // The binary format may be reused for other priors, so insist on a street_affix entry.
        const isStreetAffix = fst.accepting(span.stateId).some(({ placetype }) => placetype === "street_affix");
        if (!isStreetAffix) {
            continue;
        }
        matchedSpans.push({ startGroupIdx: start, endGroupIdx: span.endGroupIdx });

        const spanPieces = [];
        for (const member of wordGroups.slice(start, span.endGroupIdx + 1)) {
            if (member.fstToken !== "") {
                spanPieces.push(...member.pieceIndices);
            }
        }
        // Prefix and suffix are both boosted; the model and surrounding context pick the winner.
        spanPieces.forEach((pieceIdx, k) => {
            raise(pieceIdx, bioColumn(bStreetPrefix, iStreetPrefix, k), affixBias);
            raise(pieceIdx, bioColumn(bStreetSuffix, iStreetSuffix, k), affixBias);
        });
    }

    // Pass 2 — push the words flanking each affix toward street and away from dependent_locality.
    for (const { startGroupIdx, endGroupIdx } of matchedSpans) {
        const flanking = [
            findNeighbour(wordGroups, startGroupIdx, -1),
            findNeighbour(wordGroups, endGroupIdx, +1),
        ];
        for (const neighbour of flanking) {
            if (!neighbour) {
                continue;
            }
            const neighbourPieces = neighbour.pieceIndices;
            for (let k = 0; k < neighbourPieces.length; k++) {
                const pieceIdx = neighbourPieces[k];
                raise(pieceIdx, bioColumn(bStreet, iStreet, k), neighbourStreetBias);
                if (bDepLoc !== undefined) {
                    lower(pieceIdx, bioColumn(bDepLoc, iDepLoc, k), depLocBias);
                }
            }
        }
    }
    return matrix;
}
/**
 * Pick the BIO column for the `k`-th piece of a span: the `B-` column for the first piece, the
 * `I-` column afterwards, falling back to `B-` when no `I-` column exists.
 */
function bioColumn(beginColumn, insideColumn, k) {
    if (k === 0) {
        return beginColumn;
    }
    return insideColumn ?? beginColumn;
}
/**
 * Walk the FST from `wordGroups[start]` across the following groups (empty-token groups are stepped
 * over) and report the last accepting position as `{ endGroupIdx, stateId }`, or `null` when the
 * walk never reaches an accepting state.
 */
function longestAcceptedSpan(fst, wordGroups, start) {
    let state = fst.walk([wordGroups[start].fstToken]);
    if (!state) {
        return null;
    }
    let best = state.accepted ? { endGroupIdx: start, stateId: state.stateId } : null;
    for (let end = start + 1; end < wordGroups.length; end++) {
        const { fstToken } = wordGroups[end];
        if (fstToken === "") {
            continue;
        }
        state = fst.walkFrom(state, fstToken);
        if (!state) {
            break;
        }
        if (state.accepted) {
            best = { endGroupIdx: end, stateId: state.stateId };
        }
    }
    return best;
}
|
|
146
|
+
/**
 * Step through `groups` starting one position away from `fromGroupIdx`, moving by `direction`
 * (+1 or -1) each time, and return the first group whose `fstToken` is not the empty string.
 * Empty-token groups (whitespace / punctuation) are passed over. Returns `null` once the cursor
 * leaves the array without finding one.
 */
function findNeighbour(groups, fromGroupIdx, direction) {
    let cursor = fromGroupIdx + direction;
    while (cursor >= 0 && cursor < groups.length) {
        const candidate = groups[cursor];
        if (candidate.fstToken !== "") {
            return candidate;
        }
        cursor += direction;
    }
    return null;
}
|
|
159
|
+
//# sourceMappingURL=street-morphology-prior.js.map
|