@mailwoman/neural 2.1.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/anchor-inference.d.ts +57 -0
- package/out/anchor-inference.d.ts.map +1 -0
- package/out/anchor-inference.js +94 -0
- package/out/anchor-inference.js.map +1 -0
- package/out/browser.d.ts +18 -0
- package/out/browser.d.ts.map +1 -0
- package/out/browser.js +19 -0
- package/out/browser.js.map +1 -0
- package/out/classifier.d.ts +145 -11
- package/out/classifier.d.ts.map +1 -1
- package/out/classifier.js +185 -20
- package/out/classifier.js.map +1 -1
- package/out/fst-prior.d.ts +71 -0
- package/out/fst-prior.d.ts.map +1 -0
- package/out/fst-prior.js +173 -0
- package/out/fst-prior.js.map +1 -0
- package/out/index.d.ts +7 -0
- package/out/index.d.ts.map +1 -1
- package/out/index.js +5 -0
- package/out/index.js.map +1 -1
- package/out/labels.d.ts +30 -6
- package/out/labels.d.ts.map +1 -1
- package/out/labels.js +43 -6
- package/out/labels.js.map +1 -1
- package/out/onnx-runner.d.ts +8 -1
- package/out/onnx-runner.d.ts.map +1 -1
- package/out/onnx-runner.js +31 -1
- package/out/onnx-runner.js.map +1 -1
- package/out/postcode-anchor.d.ts +117 -0
- package/out/postcode-anchor.d.ts.map +1 -0
- package/out/postcode-anchor.js +269 -0
- package/out/postcode-anchor.js.map +1 -0
- package/out/postcode-binary-resolver.d.ts +60 -0
- package/out/postcode-binary-resolver.d.ts.map +1 -0
- package/out/postcode-binary-resolver.js +208 -0
- package/out/postcode-binary-resolver.js.map +1 -0
- package/out/postcode-repair.d.ts +65 -0
- package/out/postcode-repair.d.ts.map +1 -0
- package/out/postcode-repair.js +171 -0
- package/out/postcode-repair.js.map +1 -0
- package/out/proposal-classifier.d.ts +5 -1
- package/out/proposal-classifier.d.ts.map +1 -1
- package/out/proposal-classifier.js +5 -3
- package/out/proposal-classifier.js.map +1 -1
- package/out/query-shape-prior.d.ts +74 -0
- package/out/query-shape-prior.d.ts.map +1 -0
- package/out/query-shape-prior.js +223 -0
- package/out/query-shape-prior.js.map +1 -0
- package/out/street-morphology-prior.d.ts +56 -0
- package/out/street-morphology-prior.d.ts.map +1 -0
- package/out/street-morphology-prior.js +159 -0
- package/out/street-morphology-prior.js.map +1 -0
- package/out/tokenizer.d.ts +6 -1
- package/out/tokenizer.d.ts.map +1 -1
- package/out/tokenizer.js +8 -3
- package/out/tokenizer.js.map +1 -1
- package/out/unit-repair.d.ts +46 -0
- package/out/unit-repair.d.ts.map +1 -0
- package/out/unit-repair.js +147 -0
- package/out/unit-repair.js.map +1 -0
- package/out/viterbi.d.ts +76 -0
- package/out/viterbi.d.ts.map +1 -0
- package/out/viterbi.js +163 -0
- package/out/viterbi.js.map +1 -0
- package/out/vitest.config.d.ts.map +1 -1
- package/out/vitest.config.js +3 -0
- package/out/vitest.config.js.map +1 -1
- package/out/weights.d.ts +42 -0
- package/out/weights.d.ts.map +1 -1
- package/out/weights.js +92 -4
- package/out/weights.js.map +1 -1
- package/package.json +10 -3
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Soft-prior emission biases derived from `QueryShape`.
|
|
7
|
+
*
|
|
8
|
+
* When the QueryShape sub-system has identified a known-format span (US ZIP, UK postcode, PO box,
|
|
9
|
+
* etc.), this module produces an additive bias matrix that nudges the encoder's per-token
|
|
10
|
+
* emissions toward the matching BIO label. The biases compose with the structural BIO mask in the
|
|
11
|
+
* Viterbi decoder — confident encoder predictions still win, but uncertain ones get pulled toward
|
|
12
|
+
* the format-implied label.
|
|
13
|
+
*
|
|
14
|
+
* Bitter-lesson-safe boundary: we don't override the encoder, just bias it. The encoder remains the
|
|
15
|
+
* authority on context-dependent calls (the "Buffalo Wild Wings, Buffalo, NY" disambiguation);
|
|
16
|
+
* the QueryShape prior helps on the easy cases (a 5-digit token is _probably_ a postcode).
|
|
17
|
+
*
|
|
18
|
+
* Uses structural typing for the QueryShape input so this module has zero dependencies on
|
|
19
|
+
* `@mailwoman/query-shape` — consumers compute the shape with that package, pass it in here.
|
|
20
|
+
*/
|
|
21
|
+
/**
 * Lookup table from a `KnownFormat` identifier to the BIO label whose emission should be
 * boosted when that format is detected. Every postcode flavour shares `B-postcode`; PO boxes
 * get their own `B-po_box` label. Formats absent from this table receive no bias.
 */
const FORMAT_TO_LABEL = new Map();
for (const postcodeFormat of [
    "us_zip",
    "us_zip4",
    "fr_postcode",
    "de_postcode",
    "uk_postcode",
    "ca_postcode",
    "jp_postcode",
]) {
    FORMAT_TO_LABEL.set(postcodeFormat, "B-postcode");
}
FORMAT_TO_LABEL.set("po_box", "B-po_box");
|
|
35
|
+
/**
 * Produce a `[tokens.length][labels.length]` matrix of additive biases intended to be summed
 * with the encoder emissions before decoding.
 *
 * Known-format hits: for every hit whose format has an entry in `FORMAT_TO_LABEL` and whose
 * mapped label is present in `labels`, each token whose character range overlaps `hit.span`
 * has that label's column raised to at least `hit.confidence * biasScale`. `Math.max` is used,
 * so several hits touching the same cell do not accumulate.
 *
 * Region abbreviations: after the format pass, `applyLocalityBias` is invoked on the same
 * matrix with `opts.localityBiasScale` (default 2.0) and `opts.inputText`.
 *
 * When there are neither known formats nor region abbreviations, the all-zero matrix is
 * returned unchanged, which makes the result a no-op under addition.
 *
 * @param shape - Object exposing `knownFormats` and, optionally, `regionAbbreviations`.
 * @param tokens - Tokens carrying `start` / `end` character offsets.
 * @param labels - Label vocabulary; a label's index is its column in the result.
 * @param opts - Optional `biasScale` (default 1.0), `localityBiasScale` (default 2.0) and
 *   `inputText`.
 * @returns The bias matrix, one row per token.
 */
export function buildEmissionPriors(shape, tokens, labels, opts = {}) {
    const scale = opts.biasScale ?? 1.0;
    const priors = Array.from({ length: tokens.length }, () => new Array(labels.length).fill(0));
    // Label -> column index, so each hit resolves its target column in O(1).
    const columnOf = new Map();
    for (const [idx, label] of labels.entries()) {
        columnOf.set(label, idx);
    }
    const noFormats = shape.knownFormats.length === 0;
    const noAbbreviations = !shape.regionAbbreviations || shape.regionAbbreviations.length === 0;
    if (noFormats && noAbbreviations) {
        return priors;
    }
    for (const hit of shape.knownFormats) {
        const label = FORMAT_TO_LABEL.get(hit.format);
        if (!label) {
            continue;
        }
        const column = columnOf.get(label);
        if (column === undefined) {
            continue;
        }
        const bias = hit.confidence * scale;
        for (const [row, token] of tokens.entries()) {
            if (!overlaps(token, hit.span)) {
                continue;
            }
            priors[row][column] = Math.max(priors[row][column], bias);
        }
    }
    // Locality soft prior: a detected region abbreviation (e.g. "DC", "NY") pulls the tokens
    // just before it toward B-locality / I-locality, countering the tendency to label
    // ambiguous place names such as "Washington" or "New York" as regions.
    const localityBias = opts.localityBiasScale ?? 2.0;
    applyLocalityBias(priors, shape, tokens, columnOf, localityBias, opts.inputText);
    return priors;
}
|
|
81
|
+
/**
|
|
82
|
+
* Apply locality bias to tokens preceding a detected region abbreviation.
|
|
83
|
+
*
|
|
84
|
+
* For "Washington, DC" — "DC" is the region abbreviation; "Washington" gets biased toward
|
|
85
|
+
* B-locality. For "New York, NY" — "New" gets B-locality and "York" gets I-locality.
|
|
86
|
+
*
|
|
87
|
+
* Guard: if the preceding text matches the full name of the region that the abbreviation represents
|
|
88
|
+
* (e.g., "Washington" before "WA"), the locality bias is NOT applied — the text IS the region, not
|
|
89
|
+
* a locality within it.
|
|
90
|
+
*
|
|
91
|
+
* Constraint: only bias tokens that appear BEFORE the abbreviation's character offset and are
|
|
92
|
+
* alphabetic (start with uppercase). Tokens that are part of a known postcode format or are
|
|
93
|
+
* themselves region abbreviations are skipped.
|
|
94
|
+
*/
|
|
95
|
+
function applyLocalityBias(matrix, shape, tokens, labelToCol, localityBias, inputText) {
|
|
96
|
+
const abbrevs = shape.regionAbbreviations;
|
|
97
|
+
if (!abbrevs || abbrevs.length === 0)
|
|
98
|
+
return;
|
|
99
|
+
const bLocCol = labelToCol.get("B-locality");
|
|
100
|
+
const iLocCol = labelToCol.get("I-locality");
|
|
101
|
+
if (bLocCol === undefined)
|
|
102
|
+
return;
|
|
103
|
+
for (const abbrev of abbrevs) {
|
|
104
|
+
const candidates = [];
|
|
105
|
+
let prevStart = abbrev.start;
|
|
106
|
+
for (let t = tokens.length - 1; t >= 0; t--) {
|
|
107
|
+
const tok = tokens[t];
|
|
108
|
+
if (tok.end > abbrev.start)
|
|
109
|
+
continue;
|
|
110
|
+
const gap = prevStart - tok.end;
|
|
111
|
+
if (candidates.length === 0 && gap > 4)
|
|
112
|
+
break;
|
|
113
|
+
if (candidates.length > 0 && gap > 2)
|
|
114
|
+
break;
|
|
115
|
+
let isPostcode = false;
|
|
116
|
+
for (const fmt of shape.knownFormats) {
|
|
117
|
+
if (overlaps(tok, fmt.span)) {
|
|
118
|
+
isPostcode = true;
|
|
119
|
+
break;
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
if (isPostcode)
|
|
123
|
+
break;
|
|
124
|
+
candidates.push(t);
|
|
125
|
+
prevStart = tok.start;
|
|
126
|
+
}
|
|
127
|
+
if (candidates.length === 0)
|
|
128
|
+
continue;
|
|
129
|
+
candidates.reverse();
|
|
130
|
+
if (inputText) {
|
|
131
|
+
const firstTok = tokens[candidates[0]];
|
|
132
|
+
const lastTok = tokens[candidates[candidates.length - 1]];
|
|
133
|
+
const candidateText = inputText.slice(firstTok.start, lastTok.end).toLowerCase();
|
|
134
|
+
const regionNames = ABBREV_TO_REGION.get(abbrev.span);
|
|
135
|
+
if (regionNames?.some((name) => candidateText === name))
|
|
136
|
+
continue;
|
|
137
|
+
}
|
|
138
|
+
for (let i = 0; i < candidates.length; i++) {
|
|
139
|
+
const t = candidates[i];
|
|
140
|
+
const col = i === 0 ? bLocCol : iLocCol;
|
|
141
|
+
if (col === undefined)
|
|
142
|
+
continue;
|
|
143
|
+
matrix[t][col] = Math.max(matrix[t][col], localityBias);
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
const ABBREV_TO_REGION = new Map([
|
|
148
|
+
["AL", ["alabama"]],
|
|
149
|
+
["AK", ["alaska"]],
|
|
150
|
+
["AZ", ["arizona"]],
|
|
151
|
+
["AR", ["arkansas"]],
|
|
152
|
+
["CA", ["california"]],
|
|
153
|
+
["CO", ["colorado"]],
|
|
154
|
+
["CT", ["connecticut"]],
|
|
155
|
+
["DE", ["delaware"]],
|
|
156
|
+
["DC", ["district of columbia"]],
|
|
157
|
+
["FL", ["florida"]],
|
|
158
|
+
["GA", ["georgia"]],
|
|
159
|
+
["HI", ["hawaii"]],
|
|
160
|
+
["ID", ["idaho"]],
|
|
161
|
+
["IL", ["illinois"]],
|
|
162
|
+
["IN", ["indiana"]],
|
|
163
|
+
["IA", ["iowa"]],
|
|
164
|
+
["KS", ["kansas"]],
|
|
165
|
+
["KY", ["kentucky"]],
|
|
166
|
+
["LA", ["louisiana"]],
|
|
167
|
+
["ME", ["maine"]],
|
|
168
|
+
["MD", ["maryland"]],
|
|
169
|
+
["MA", ["massachusetts"]],
|
|
170
|
+
["MI", ["michigan"]],
|
|
171
|
+
["MN", ["minnesota"]],
|
|
172
|
+
["MS", ["mississippi"]],
|
|
173
|
+
["MO", ["missouri"]],
|
|
174
|
+
["MT", ["montana"]],
|
|
175
|
+
["NE", ["nebraska"]],
|
|
176
|
+
["NV", ["nevada"]],
|
|
177
|
+
["NH", ["new hampshire"]],
|
|
178
|
+
["NJ", ["new jersey"]],
|
|
179
|
+
["NM", ["new mexico"]],
|
|
180
|
+
["NY", ["new york"]],
|
|
181
|
+
["NC", ["north carolina"]],
|
|
182
|
+
["ND", ["north dakota"]],
|
|
183
|
+
["OH", ["ohio"]],
|
|
184
|
+
["OK", ["oklahoma"]],
|
|
185
|
+
["OR", ["oregon"]],
|
|
186
|
+
["PA", ["pennsylvania"]],
|
|
187
|
+
["RI", ["rhode island"]],
|
|
188
|
+
["SC", ["south carolina"]],
|
|
189
|
+
["SD", ["south dakota"]],
|
|
190
|
+
["TN", ["tennessee"]],
|
|
191
|
+
["TX", ["texas"]],
|
|
192
|
+
["UT", ["utah"]],
|
|
193
|
+
["VT", ["vermont"]],
|
|
194
|
+
["VA", ["virginia"]],
|
|
195
|
+
["WA", ["washington"]],
|
|
196
|
+
["WV", ["west virginia"]],
|
|
197
|
+
["WI", ["wisconsin"]],
|
|
198
|
+
["WY", ["wyoming"]],
|
|
199
|
+
["AS", ["american samoa"]],
|
|
200
|
+
["GU", ["guam"]],
|
|
201
|
+
["MP", ["northern mariana islands"]],
|
|
202
|
+
["PR", ["puerto rico"]],
|
|
203
|
+
["VI", ["virgin islands"]],
|
|
204
|
+
]);
|
|
205
|
+
function overlaps(a, b) {
|
|
206
|
+
return a.start < b.end && b.start < a.end;
|
|
207
|
+
}
|
|
208
|
+
/**
 * Sum two matrices cell by cell and return the result as a freshly allocated matrix; neither
 * input is modified.
 *
 * The output always has the shape of `emissions`. An empty `priors` yields a row-by-row copy
 * of `emissions`. A prior row or prior cell that is missing (`null` / `undefined`) counts as 0.
 */
export function addEmissionMatrix(emissions, priors) {
    if (priors.length === 0) {
        return emissions.map((row) => row.slice());
    }
    const summed = [];
    for (const [rowIdx, emissionRow] of emissions.entries()) {
        const priorRow = priors[rowIdx];
        const width = emissionRow.length;
        summed.push(Array.from({ length: width }, (_unused, colIdx) => emissionRow[colIdx] + (priorRow?.[colIdx] ?? 0)));
    }
    return summed;
}
|
|
223
|
+
//# sourceMappingURL=query-shape-prior.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"query-shape-prior.js","sourceRoot":"","sources":["../query-shape-prior.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AA6BH;;;GAGG;AACH,MAAM,eAAe,GAAgC,IAAI,GAAG,CAAC;IAC5D,CAAC,QAAQ,EAAE,YAAY,CAAC;IACxB,CAAC,SAAS,EAAE,YAAY,CAAC;IACzB,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,QAAQ,EAAE,UAAU,CAAC;CACtB,CAAC,CAAA;AAkBF;;;;;;;;;GASG;AACH,MAAM,UAAU,mBAAmB,CAClC,KAAqB,EACrB,MAAgC,EAChC,MAA6B,EAC7B,OAAwB,EAAE;IAE1B,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAA;IACvB,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAA;IACvB,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,GAAG,CAAA;IACvC,MAAM,MAAM,GAAe,EAAE,CAAA;IAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;QAAE,MAAM,CAAC,IAAI,CAAC,IAAI,KAAK,CAAS,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;IAErE,wCAAwC;IACxC,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAA;IAC5C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE;QAAE,UAAU,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,EAAE,CAAC,CAAC,CAAA;IAErE,IAAI,KAAK,CAAC,YAAY,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,mBAAmB,IAAI,KAAK,CAAC,mBAAmB,CAAC,MAAM,KAAK,CAAC,CAAC,EAAE,CAAC;QAC/G,OAAO,MAAM,CAAA;IACd,CAAC;IAED,KAAK,MAAM,GAAG,IAAI,KAAK,CAAC,YAAY,EAAE,CAAC;QACtC,MAAM,WAAW,GAAG,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;QACnD,IAAI,CAAC,WAAW;YAAE,SAAQ;QAC1B,MAAM,GAAG,GAAG,UAAU,CAAC,GAAG,CAAC,WAAW,CAAC,CAAA;QACvC,IAAI,GAAG,KAAK,SAAS;YAAE,SAAQ;QAC/B,MAAM,IAAI,GAAG,GAAG,CAAC,UAAU,GAAG,SAAS,CAAA;QACvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5B,MAAM,GAAG,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACtB,IAAI,QAAQ,CAAC,GAAG,EAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC7B,MAAM,CAAC,CAAC,CAAE,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,GAAG,CAAE,EAAE,IAAI,CAAC,CAAA;YACnD,CAAC;QACF,CAAC;IACF,CAAC;IAED,uFAAuF;IACvF,oFAAoF;IACpF,qFAAqF;IACrF,+CAA+C;IAC/C,iBAAiB,CAAC,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,EAAE,IAAI,CAAC,iBAAiB,IAAI,GAAG,EAAE,
IAAI,CAAC,SAAS,CAAC,CAAA;IAEnG,OAAO,MAAM,CAAA;AACd,CAAC;AAED;;;;;;;;;;;;;GAaG;AACH,SAAS,iBAAiB,CACzB,MAAkB,EAClB,KAAqB,EACrB,MAAqD,EACrD,UAA+B,EAC/B,YAAoB,EACpB,SAAkB;IAElB,MAAM,OAAO,GAAG,KAAK,CAAC,mBAAmB,CAAA;IACzC,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAM;IAE5C,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,YAAY,CAAC,CAAA;IAC5C,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,YAAY,CAAC,CAAA;IAC5C,IAAI,OAAO,KAAK,SAAS;QAAE,OAAM;IAEjC,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAa,EAAE,CAAA;QAC/B,IAAI,SAAS,GAAG,MAAM,CAAC,KAAK,CAAA;QAE5B,KAAK,IAAI,CAAC,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7C,MAAM,GAAG,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACtB,IAAI,GAAG,CAAC,GAAG,GAAG,MAAM,CAAC,KAAK;gBAAE,SAAQ;YAEpC,MAAM,GAAG,GAAG,SAAS,GAAG,GAAG,CAAC,GAAG,CAAA;YAC/B,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,IAAI,GAAG,GAAG,CAAC;gBAAE,MAAK;YAC7C,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,IAAI,GAAG,GAAG,CAAC;gBAAE,MAAK;YAE3C,IAAI,UAAU,GAAG,KAAK,CAAA;YACtB,KAAK,MAAM,GAAG,IAAI,KAAK,CAAC,YAAY,EAAE,CAAC;gBACtC,IAAI,QAAQ,CAAC,GAAG,EAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;oBAC7B,UAAU,GAAG,IAAI,CAAA;oBACjB,MAAK;gBACN,CAAC;YACF,CAAC;YACD,IAAI,UAAU;gBAAE,MAAK;YAErB,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YAClB,SAAS,GAAG,GAAG,CAAC,KAAK,CAAA;QACtB,CAAC;QAED,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;YAAE,SAAQ;QACrC,UAAU,CAAC,OAAO,EAAE,CAAA;QAEpB,IAAI,SAAS,EAAE,CAAC;YACf,MAAM,QAAQ,GAAG,MAAM,CAAC,UAAU,CAAC,CAAC,CAAE,CAAE,CAAA;YACxC,MAAM,OAAO,GAAG,MAAM,CAAC,UAAU,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,CAAE,CAAE,CAAA;YAC3D,MAAM,aAAa,GAAG,SAAS,CAAC,KAAK,CAAC,QAAQ,CAAC,KAAK,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,WAAW,EAAE,CAAA;YAChF,MAAM,WAAW,GAAG,gBAAgB,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;YACrD,IAAI,WAAW,EAAE,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,aAAa,KAAK,IAAI,CAAC;gBAAE,SAAQ;QAClE,CAAC;QAED,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5C,MAAM,CAAC,GAAG,UAAU,CAAC,CAAC,CAAE,CAAA;YACxB,MAAM,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAA;YACvC,IAAI,GAAG,KAAK,SAAS;gBAAE,SAAQ;YAC/B,M
AAM,CAAC,CAAC,CAAE,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,GAAG,CAAE,EAAE,YAAY,CAAC,CAAA;QAC3D,CAAC;IACF,CAAC;AACF,CAAC;AAED,MAAM,gBAAgB,GAAkC,IAAI,GAAG,CAAC;IAC/D,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,CAAC;IAClB,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,YAAY,CAAC,CAAC;IACtB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,aAAa,CAAC,CAAC;IACvB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,sBAAsB,CAAC,CAAC;IAChC,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,CAAC;IAClB,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,CAAC;IACjB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC,CAAC;IAChB,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,CAAC;IAClB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,WAAW,CAAC,CAAC;IACrB,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,CAAC;IACjB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,eAAe,CAAC,CAAC;IACzB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,WAAW,CAAC,CAAC;IACrB,CAAC,IAAI,EAAE,CAAC,aAAa,CAAC,CAAC;IACvB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,CAAC;IAClB,CAAC,IAAI,EAAE,CAAC,eAAe,CAAC,CAAC;IACzB,CAAC,IAAI,EAAE,CAAC,YAAY,CAAC,CAAC;IACtB,CAAC,IAAI,EAAE,CAAC,YAAY,CAAC,CAAC;IACtB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,gBAAgB,CAAC,CAAC;IAC1B,CAAC,IAAI,EAAE,CAAC,cAAc,CAAC,CAAC;IACxB,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC,CAAC;IAChB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,CAAC;IAClB,CAAC,IAAI,EAAE,CAAC,cAAc,CAAC,CAAC;IACxB,CAAC,IAAI,EAAE,CAAC,cAAc,CAAC,CAAC;IACxB,CAAC,IAAI,EAAE,CAAC,gBAAgB,CAAC,CAAC;IAC1B,CAAC,IAAI,EAAE,CAAC,cAAc,CAAC,CAAC;IACxB,CAAC,IAAI,EAAE,CAAC,WAAW,CAAC,CAAC;IACrB,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,CAAC;IACjB,C
AAC,IAAI,EAAE,CAAC,MAAM,CAAC,CAAC;IAChB,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,CAAC;IACpB,CAAC,IAAI,EAAE,CAAC,YAAY,CAAC,CAAC;IACtB,CAAC,IAAI,EAAE,CAAC,eAAe,CAAC,CAAC;IACzB,CAAC,IAAI,EAAE,CAAC,WAAW,CAAC,CAAC;IACrB,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC;IACnB,CAAC,IAAI,EAAE,CAAC,gBAAgB,CAAC,CAAC;IAC1B,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC,CAAC;IAChB,CAAC,IAAI,EAAE,CAAC,0BAA0B,CAAC,CAAC;IACpC,CAAC,IAAI,EAAE,CAAC,aAAa,CAAC,CAAC;IACvB,CAAC,IAAI,EAAE,CAAC,gBAAgB,CAAC,CAAC;CAC1B,CAAC,CAAA;AAEF,SAAS,QAAQ,CAAC,CAAiC,EAAE,CAAiC;IACrF,OAAO,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAA;AAC1C,CAAC;AAED,0EAA0E;AAC1E,MAAM,UAAU,iBAAiB,CAAC,SAAqB,EAAE,MAAkB;IAC1E,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,CAAA;IACnE,MAAM,GAAG,GAAe,EAAE,CAAA;IAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,MAAM,CAAC,GAAG,SAAS,CAAC,CAAC,CAAE,CAAA;QACvB,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,IAAI,KAAK,CAAS,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QAC1D,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,MAAM,CAAC,CAAA;QACvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAE,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAA;QAC/D,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACd,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC"}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Street-morphology emission bias — Layer 1 of the four-layer street-supplement architecture (see
|
|
7
|
+
* `docs/articles/concepts/street-supplement-architecture.md`).
|
|
8
|
+
*
|
|
9
|
+
* This module composes with {@linkcode buildFstEmissionPriors} (admin FST) and the QueryShape prior
|
|
10
|
+
* via {@linkcode addEmissionMatrix} — same shape, same additive semantics. Where the admin FST
|
|
11
|
+
* biases admin BIO labels (`B/I-locality`, `B/I-region`, ...), the morphology FST biases:
|
|
12
|
+
*
|
|
13
|
+
* - **Affix-token (the matched span):** toward `B/I-street_prefix` AND `B/I-street_suffix` (position
|
|
14
|
+
* unknown — let the model + context disambiguate).
|
|
15
|
+
* - **Adjacent token (one before AND one after each match):** toward `B/I-street`, AWAY from
|
|
16
|
+
* `B/I-dependent_locality`. The negative bias on `dependent_locality` is the load-bearing
|
|
17
|
+
* piece — it closes the inference-time vacuum that caused v0.6.1's 1066 dep_locality
|
|
18
|
+
* hallucinations (see [[project-v061-failure-mechanism]]).
|
|
19
|
+
*
|
|
20
|
+
* The morphology FST itself is built by `resolver-wof-sqlite/street-morphology-fst-builder.ts` and
|
|
21
|
+
* ships as a separate binary (`fst-street-morphology.bin`) loaded into a second `FstMatcher`
|
|
22
|
+
* instance.
|
|
23
|
+
*/
|
|
24
|
+
import { type FstMatcherLike } from "./fst-prior.js";
|
|
25
|
+
import type { TokenLike } from "./query-shape-prior.js";
|
|
26
|
+
/** Tuning knobs for {@linkcode buildStreetMorphologyEmissionPriors}. All fields are optional. */
export interface StreetMorphologyPriorOpts {
    /**
     * Multiplier applied to {@linkcode maxAffixBias} and {@linkcode maxNeighbourStreetBias} to
     * obtain the biases actually written into the matrix. Default 1.0.
     *
     * The implementation in `street-morphology-prior.js` does not multiply
     * {@linkcode dependentLocalityPenalty} by this value, so setting `biasScale` to 0 removes
     * the positive biases but leaves the penalty in place.
     */
    biasScale?: number;
    /**
     * Maximum bias magnitude (logits) on the affix span itself. Default 3.0 — same as the admin FST.
     * The morphology signal is structurally less ambiguous than admin names (`Avenue` is almost never
     * anything but street-typing), so equal magnitude is justified.
     */
    maxAffixBias?: number;
    /**
     * Maximum bias magnitude (logits) on the adjacent (neighbour) tokens for the `street` label.
     * Default 2.0 — a touch weaker than the affix bias because the neighbour is inferred from
     * adjacency, not direct match.
     */
    maxNeighbourStreetBias?: number;
    /**
     * Magnitude of the negative bias applied to `dependent_locality` BIO labels on the adjacent
     * tokens. Default 2.0. Supply a positive number; it is negated before being written. This is
     * the load-bearing piece.
     */
    dependentLocalityPenalty?: number;
}
|
|
47
|
+
/**
 * Derive a `[seqLen][numLabels]` bias matrix from the matches of a street-morphology FST.
 *
 * The result has the same shape and additive meaning as the admin FST prior, so it is combined
 * with emissions through {@linkcode addEmissionMatrix} in the same way.
 *
 * @param fst - Matcher loaded with the street-morphology FST.
 * @param pieces - Tokens with character offsets plus their `piece` string.
 * @param labels - Label vocabulary; a label's index is its column in the result.
 * @param opts - Optional bias magnitudes, see {@linkcode StreetMorphologyPriorOpts}.
 * @returns One row per piece, one column per label.
 */
export declare function buildStreetMorphologyEmissionPriors(
    fst: FstMatcherLike,
    pieces: ReadonlyArray<TokenLike & {
        piece: string;
    }>,
    labels: ReadonlyArray<string>,
    opts?: StreetMorphologyPriorOpts,
): number[][];
|
|
56
|
+
//# sourceMappingURL=street-morphology-prior.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"street-morphology-prior.d.ts","sourceRoot":"","sources":["../street-morphology-prior.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,EAAwB,KAAK,cAAc,EAAkB,MAAM,gBAAgB,CAAA;AAC1F,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAA;AAEvD,MAAM,WAAW,yBAAyB;IACzC,sFAAsF;IACtF,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB;;;;OAIG;IACH,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB;;;;OAIG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAA;IAC/B;;;OAGG;IACH,wBAAwB,CAAC,EAAE,MAAM,CAAA;CACjC;AAED;;;;;GAKG;AACH,wBAAgB,mCAAmC,CAClD,GAAG,EAAE,cAAc,EACnB,MAAM,EAAE,aAAa,CAAC,SAAS,GAAG;IAAE,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,EAAE,aAAa,CAAC,MAAM,CAAC,EAC7B,IAAI,GAAE,yBAA8B,GAClC,MAAM,EAAE,EAAE,CAgIZ"}
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Street-morphology emission bias — Layer 1 of the four-layer street-supplement architecture (see
|
|
7
|
+
* `docs/articles/concepts/street-supplement-architecture.md`).
|
|
8
|
+
*
|
|
9
|
+
* This module composes with {@linkcode buildFstEmissionPriors} (admin FST) and the QueryShape prior
|
|
10
|
+
* via {@linkcode addEmissionMatrix} — same shape, same additive semantics. Where the admin FST
|
|
11
|
+
* biases admin BIO labels (`B/I-locality`, `B/I-region`, ...), the morphology FST biases:
|
|
12
|
+
*
|
|
13
|
+
* - **Affix-token (the matched span):** toward `B/I-street_prefix` AND `B/I-street_suffix` (position
|
|
14
|
+
* unknown — let the model + context disambiguate).
|
|
15
|
+
* - **Adjacent token (one before AND one after each match):** toward `B/I-street`, AWAY from
|
|
16
|
+
* `B/I-dependent_locality`. The negative bias on `dependent_locality` is the load-bearing
|
|
17
|
+
* piece — it closes the inference-time vacuum that caused v0.6.1's 1066 dep_locality
|
|
18
|
+
* hallucinations (see [[project-v061-failure-mechanism]]).
|
|
19
|
+
*
|
|
20
|
+
* The morphology FST itself is built by `resolver-wof-sqlite/street-morphology-fst-builder.ts` and
|
|
21
|
+
* ships as a separate binary (`fst-street-morphology.bin`) loaded into a second `FstMatcher`
|
|
22
|
+
* instance.
|
|
23
|
+
*/
|
|
24
|
+
import { groupPiecesIntoWords } from "./fst-prior.js";
|
|
25
|
+
/**
 * Starting at word group `start`, feed successive non-empty word groups to the FST and report
 * the furthest group at which the walk was in an accepting state.
 *
 * @returns `{ endGroupIdx, stateId }` for the longest accepting walk, or `null` when the first
 *   token cannot be walked or no accepting state was reached.
 */
function longestAcceptingWalk(fst, wordGroups, start) {
    const first = fst.walk([wordGroups[start].fstToken]);
    if (!first) {
        return null;
    }
    let best = first.accepted ? { endGroupIdx: start, stateId: first.stateId } : null;
    let state = first;
    for (let end = start + 1; end < wordGroups.length; end++) {
        const token = wordGroups[end].fstToken;
        // Empty groups are stepped over without consuming an FST transition.
        if (token === "") {
            continue;
        }
        const advanced = fst.walkFrom(state, token);
        if (!advanced) {
            break;
        }
        if (advanced.accepted) {
            best = { endGroupIdx: end, stateId: advanced.stateId };
        }
        state = advanced;
    }
    return best;
}
/**
 * Derive a `[pieces.length][labels.length]` bias matrix from street-morphology FST matches.
 *
 * Pass 1 finds, from every word group, the longest FST walk that ends in an accepting state
 * carrying a `street_affix` entry, and raises both the street-prefix and street-suffix columns
 * of the matched pieces to `biasScale * maxAffixBias` (B- for the first piece, I- for the rest,
 * falling back to B- when the I- label is absent).
 *
 * Pass 2 takes the nearest non-empty word group on each side of every match, raises its street
 * columns to `biasScale * maxNeighbourStreetBias`, and, when `B-dependent_locality` exists,
 * lowers its dependent-locality columns to at most `-dependentLocalityPenalty`.
 *
 * An all-zero matrix is returned when the vocabulary lacks `B-street`, `B-street_prefix` or
 * `B-street_suffix`, when there are no word groups, or when nothing matched. The result is
 * meant to be combined with emissions via {@linkcode addEmissionMatrix}.
 */
export function buildStreetMorphologyEmissionPriors(fst, pieces, labels, opts = {}) {
    const scale = opts.biasScale ?? 1.0;
    const affixCap = opts.maxAffixBias ?? 3.0;
    const neighbourCap = opts.maxNeighbourStreetBias ?? 2.0;
    const depLocPenalty = opts.dependentLocalityPenalty ?? 2.0;
    const priors = Array.from({ length: pieces.length }, () => new Array(labels.length).fill(0));
    const columnOf = new Map();
    for (const [idx, label] of labels.entries()) {
        columnOf.set(label, idx);
    }
    const prefixB = columnOf.get("B-street_prefix");
    const prefixI = columnOf.get("I-street_prefix");
    const suffixB = columnOf.get("B-street_suffix");
    const suffixI = columnOf.get("I-street_suffix");
    const streetB = columnOf.get("B-street");
    const streetI = columnOf.get("I-street");
    const depLocB = columnOf.get("B-dependent_locality");
    const depLocI = columnOf.get("I-dependent_locality");
    // A vocabulary without street tags (e.g. a Stage 1 model) leaves nothing to bias toward,
    // so hand back the zero matrix and let the additive pipeline no-op.
    const missingStreetLabel = [streetB, prefixB, suffixB].includes(undefined);
    if (missingStreetLabel) {
        return priors;
    }
    const wordGroups = groupPiecesIntoWords(pieces);
    if (wordGroups.length === 0) {
        return priors;
    }
    // Pass 1 — from every starting word group, take the longest accepting morphology match and
    // bias its pieces toward the affix labels.
    const affixBias = scale * affixCap;
    const affixMatches = [];
    for (let start = 0; start < wordGroups.length; start++) {
        if (wordGroups[start].fstToken === "") {
            continue;
        }
        const best = longestAcceptingWalk(fst, wordGroups, start);
        if (best === null) {
            continue;
        }
        // Only street_affix entries count; the FST may carry other placetypes as well.
        const isStreetAffix = fst.accepting(best.stateId).some((entry) => entry.placetype === "street_affix");
        if (!isStreetAffix) {
            continue;
        }
        affixMatches.push({ startGroupIdx: start, endGroupIdx: best.endGroupIdx });
        // Bias both prefix and suffix labels; which one wins is left to the model logits and
        // the other priors rather than decided here.
        let isFirstPiece = true;
        for (let g = start; g <= best.endGroupIdx; g++) {
            const group = wordGroups[g];
            if (group.fstToken === "") {
                continue;
            }
            for (const pieceIdx of group.pieceIndices) {
                const prefixCol = isFirstPiece ? prefixB : (prefixI ?? prefixB);
                const suffixCol = isFirstPiece ? suffixB : (suffixI ?? suffixB);
                const row = priors[pieceIdx];
                row[prefixCol] = Math.max(row[prefixCol], affixBias);
                row[suffixCol] = Math.max(row[suffixCol], affixBias);
                isFirstPiece = false;
            }
        }
    }
    if (affixMatches.length === 0) {
        return priors;
    }
    // Pass 2 — bias the closest non-empty word group on each side of every match toward
    // street and away from dependent_locality.
    const neighbourBias = scale * neighbourCap;
    const biasNeighbour = (neighbour) => {
        if (!neighbour) {
            return;
        }
        const indices = neighbour.pieceIndices;
        for (let k = 0; k < indices.length; k++) {
            const row = priors[indices[k]];
            const leading = k === 0;
            const streetCol = leading ? streetB : (streetI ?? streetB);
            row[streetCol] = Math.max(row[streetCol], neighbourBias);
            if (depLocB !== undefined) {
                const depLocCol = leading ? depLocB : (depLocI ?? depLocB);
                row[depLocCol] = Math.min(row[depLocCol], -depLocPenalty);
            }
        }
    };
    for (const { startGroupIdx, endGroupIdx } of affixMatches) {
        biasNeighbour(findNeighbour(wordGroups, startGroupIdx, -1));
        biasNeighbour(findNeighbour(wordGroups, endGroupIdx, +1));
    }
    return priors;
}
|
|
146
|
+
/**
|
|
147
|
+
* Walk word groups outward from `fromGroupIdx` in `direction` (+1 or -1), skipping empty groups
|
|
148
|
+
* (whitespace / punctuation), and return the first non-empty group encountered — or `null` if no
|
|
149
|
+
* such neighbour exists.
|
|
150
|
+
*/
|
|
151
|
+
function findNeighbour(groups, fromGroupIdx, direction) {
|
|
152
|
+
for (let i = fromGroupIdx + direction; i >= 0 && i < groups.length; i += direction) {
|
|
153
|
+
const g = groups[i];
|
|
154
|
+
if (g.fstToken !== "")
|
|
155
|
+
return g;
|
|
156
|
+
}
|
|
157
|
+
return null;
|
|
158
|
+
}
|
|
159
|
+
//# sourceMappingURL=street-morphology-prior.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"street-morphology-prior.js","sourceRoot":"","sources":["../street-morphology-prior.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,EAAE,oBAAoB,EAAuC,MAAM,gBAAgB,CAAA;AAyB1F;;;;;GAKG;AACH,MAAM,UAAU,mCAAmC,CAClD,GAAmB,EACnB,MAAoD,EACpD,MAA6B,EAC7B,OAAkC,EAAE;IAEpC,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAA;IACvB,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAA;IACvB,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,GAAG,CAAA;IACvC,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,IAAI,GAAG,CAAA;IAC7C,MAAM,sBAAsB,GAAG,IAAI,CAAC,sBAAsB,IAAI,GAAG,CAAA;IACjE,MAAM,wBAAwB,GAAG,IAAI,CAAC,wBAAwB,IAAI,GAAG,CAAA;IAErE,MAAM,MAAM,GAAe,EAAE,CAAA;IAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;QAAE,MAAM,CAAC,IAAI,CAAC,IAAI,KAAK,CAAS,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;IAErE,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAA;IAC5C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE;QAAE,UAAU,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,EAAE,CAAC,CAAC,CAAA;IAErE,MAAM,aAAa,GAAG,UAAU,CAAC,GAAG,CAAC,iBAAiB,CAAC,CAAA;IACvD,MAAM,aAAa,GAAG,UAAU,CAAC,GAAG,CAAC,iBAAiB,CAAC,CAAA;IACvD,MAAM,aAAa,GAAG,UAAU,CAAC,GAAG,CAAC,iBAAiB,CAAC,CAAA;IACvD,MAAM,aAAa,GAAG,UAAU,CAAC,GAAG,CAAC,iBAAiB,CAAC,CAAA;IACvD,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,UAAU,CAAC,CAAA;IAC1C,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,UAAU,CAAC,CAAA;IAC1C,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,sBAAsB,CAAC,CAAA;IACtD,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,sBAAsB,CAAC,CAAA;IAEtD,6FAA6F;IAC7F,kFAAkF;IAClF,IAAI,OAAO,KAAK,SAAS,IAAI,aAAa,KAAK,SAAS,IAAI,aAAa,KAAK,SAAS,EAAE,CAAC;QACzF,OAAO,MAAM,CAAA;IACd,CAAC;IAED,MAAM,UAAU,GAAG,oBAAoB,CAAC,MAAM,CAAC,CAAA;IAC/C,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,MAAM,CAAA;IAQ1C,MAAM,YAAY,GAAiB,EAAE,CAAA;IAErC,0FAA0F;IAC1F,oCAAoC;IACpC,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,UAAU,CAAC,MAAM,EAAE,KAAK,EAAE,EAAE,CAAC;QACxD,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAE,CAAA;QAChC,IAAI,KAAK,CAAC,QAAQ,KAAK,EAAE;YAAE,SAAQ;QAEnC,MAAM,OAAO,GAAG,GAAG,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAA;QAC1C,IAAI,CAAC,OAAO;YAAE,SAAQ;QA
EtB,IAAI,OAAO,GAAG,CAAC,CAAC,CAAA;QAChB,IAAI,WAAW,GAAG,CAAC,CAAC,CAAA;QACpB,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;YACtB,OAAO,GAAG,KAAK,CAAA;YACf,WAAW,GAAG,OAAO,CAAC,OAAO,CAAA;QAC9B,CAAC;QAED,IAAI,OAAO,GAAG,OAAO,CAAA;QACrB,KAAK,IAAI,GAAG,GAAG,KAAK,GAAG,CAAC,EAAE,GAAG,GAAG,UAAU,CAAC,MAAM,EAAE,GAAG,EAAE,EAAE,CAAC;YAC1D,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,CAAE,CAAA;YAClC,IAAI,SAAS,CAAC,QAAQ,KAAK,EAAE;gBAAE,SAAQ;YAEvC,MAAM,IAAI,GAAG,GAAG,CAAC,QAAQ,CAAC,OAAO,EAAE,SAAS,CAAC,QAAQ,CAAC,CAAA;YACtD,IAAI,CAAC,IAAI;gBAAE,MAAK;YAChB,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACnB,OAAO,GAAG,GAAG,CAAA;gBACb,WAAW,GAAG,IAAI,CAAC,OAAO,CAAA;YAC3B,CAAC;YACD,OAAO,GAAG,IAAI,CAAA;QACf,CAAC;QAED,IAAI,OAAO,KAAK,CAAC,CAAC;YAAE,SAAQ;QAE5B,2FAA2F;QAC3F,uEAAuE;QACvE,MAAM,OAAO,GAAG,GAAG,CAAC,SAAS,CAAC,WAAW,CAAC,CAAA;QAC1C,MAAM,QAAQ,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,cAAc,CAAC,CAAA;QACpE,IAAI,CAAC,QAAQ;YAAE,SAAQ;QAEvB,YAAY,CAAC,IAAI,CAAC,EAAE,aAAa,EAAE,KAAK,EAAE,WAAW,EAAE,OAAO,EAAE,CAAC,CAAA;QAEjE,8CAA8C;QAC9C,MAAM,iBAAiB,GAAa,EAAE,CAAA;QACtC,KAAK,IAAI,CAAC,GAAG,KAAK,EAAE,CAAC,IAAI,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,EAAE,GAAG,UAAU,CAAC,CAAC,CAAE,CAAA;YACzB,IAAI,EAAE,CAAC,QAAQ,KAAK,EAAE;gBAAE,SAAQ;YAChC,KAAK,MAAM,EAAE,IAAI,EAAE,CAAC,YAAY;gBAAE,iBAAiB,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;QAC7D,CAAC;QAED,0FAA0F;QAC1F,yFAAyF;QACzF,yFAAyF;QACzF,MAAM,SAAS,GAAG,SAAS,GAAG,YAAY,CAAA;QAC1C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,iBAAiB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACnD,MAAM,EAAE,GAAG,iBAAiB,CAAC,CAAC,CAAE,CAAA;YAChC,MAAM,SAAS,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,aAAa,IAAI,aAAa,CAAC,CAAA;YAC5E,MAAM,SAAS,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,aAAa,IAAI,aAAa,CAAC,CAAA;YAC5E,MAAM,CAAC,EAAE,CAAE,CAAC,SAAS,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAE,CAAC,SAAS,CAAE,EAAE,SAAS,CAAC,CAAA;YACrE,MAAM,CAAC,EAAE,CAAE,CAAC,SAAS,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAE,CAAC,SAAS,CAAE,EAAE,SAAS,CAAC,CAAA;QACtE,CAAC;IACF,CAAC;IAED,IAAI,YAAY,CAAC,MAAM,
KAAK,CAAC;QAAE,OAAO,MAAM,CAAA;IAE5C,yFAAyF;IACzF,kFAAkF;IAClF,sBAAsB;IACtB,MAAM,mBAAmB,GAAG,SAAS,GAAG,sBAAsB,CAAA;IAC9D,KAAK,MAAM,KAAK,IAAI,YAAY,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,aAAa,CAAC,UAAU,EAAE,KAAK,CAAC,aAAa,EAAE,CAAC,CAAC,CAAC,CAAA;QACjE,MAAM,KAAK,GAAG,aAAa,CAAC,UAAU,EAAE,KAAK,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,CAAA;QAE9D,KAAK,MAAM,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EAAE,CAAC;YACzC,IAAI,CAAC,SAAS;gBAAE,SAAQ;YACxB,MAAM,OAAO,GAAG,SAAS,CAAC,YAAY,CAAA;YACtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACzC,MAAM,EAAE,GAAG,OAAO,CAAC,CAAC,CAAE,CAAA;gBACtB,MAAM,SAAS,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,IAAI,OAAO,CAAC,CAAA;gBAC1D,MAAM,CAAC,EAAE,CAAE,CAAC,SAAS,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAE,CAAC,SAAS,CAAE,EAAE,mBAAmB,CAAC,CAAA;gBAE/E,IAAI,OAAO,KAAK,SAAS,EAAE,CAAC;oBAC3B,MAAM,SAAS,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,IAAI,OAAO,CAAC,CAAA;oBAC1D,MAAM,CAAC,EAAE,CAAE,CAAC,SAAS,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAE,CAAC,SAAS,CAAE,EAAE,CAAC,wBAAwB,CAAC,CAAA;gBACtF,CAAC;YACF,CAAC;QACF,CAAC;IACF,CAAC;IAED,OAAO,MAAM,CAAA;AACd,CAAC;AAED;;;;GAIG;AACH,SAAS,aAAa,CAAC,MAAmB,EAAE,YAAoB,EAAE,SAAiB;IAClF,KAAK,IAAI,CAAC,GAAG,YAAY,GAAG,SAAS,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,IAAI,SAAS,EAAE,CAAC;QACpF,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;QACpB,IAAI,CAAC,CAAC,QAAQ,KAAK,EAAE;YAAE,OAAO,CAAC,CAAA;IAChC,CAAC;IACD,OAAO,IAAI,CAAA;AACZ,CAAC"}
|
package/out/tokenizer.d.ts
CHANGED
|
@@ -51,7 +51,12 @@ export declare class MailwomanTokenizer {
|
|
|
51
51
|
private constructor();
|
|
52
52
|
/** Load from a base64-encoded `tokenizer.model`. Use for in-memory / test setups. */
|
|
53
53
|
static loadFromBase64(b64: string): Promise<MailwomanTokenizer>;
|
|
54
|
-
/**
|
|
54
|
+
/**
|
|
55
|
+
* Load from a path to a `tokenizer.model` file on disk. **Node-only** — the dynamic `node:fs`
|
|
56
|
+
* import keeps this method out of the static dependency graph so the rest of the tokenizer
|
|
57
|
+
* bundles cleanly for the browser. Calling it in a browser throws at runtime; use
|
|
58
|
+
* `loadFromBase64` (or the URL-fetching loaders in `@mailwoman/neural-web`) instead.
|
|
59
|
+
*/
|
|
55
60
|
static loadFromFile(modelPath: string): Promise<MailwomanTokenizer>;
|
|
56
61
|
/**
|
|
57
62
|
* Tokenize `text` to pieces + ids + realigned char offsets.
|
package/out/tokenizer.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;
|
|
1
|
+
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AAIH,4EAA4E;AAC5E,eAAO,MAAM,cAAc,WAAM,CAAA;AAEjC,0EAA0E;AAC1E,MAAM,WAAW,cAAc;IAC9B,wFAAwF;IACxF,KAAK,EAAE,MAAM,CAAA;IACb,mCAAmC;IACnC,EAAE,EAAE,MAAM,CAAA;IACV,yDAAyD;IACzD,KAAK,EAAE,MAAM,CAAA;IACb,uDAAuD;IACvD,GAAG,EAAE,MAAM,CAAA;CACX;AAED,MAAM,WAAW,YAAY;IAC5B,MAAM,EAAE,cAAc,EAAE,CAAA;IACxB,GAAG,EAAE,MAAM,EAAE,CAAA;CACb;AAED,qBAAa,kBAAkB;IACV,OAAO,CAAC,QAAQ,CAAC,SAAS;IAA9C,OAAO;IAEP,qFAAqF;WACxE,cAAc,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC;IAMrE;;;;;OAKG;WACU,YAAY,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC;IAMzE;;;;;;OAMG;IACH,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY;IA2BlC,oFAAoF;IACpF,MAAM,CAAC,GAAG,EAAE,MAAM,EAAE,GAAG,UAAU,GAAG,MAAM;CAI1C"}
|
package/out/tokenizer.js
CHANGED
|
@@ -30,7 +30,6 @@
|
|
|
30
30
|
* - `loadFromFile(path)` — convenience helper that does the read + b64 + load.
|
|
31
31
|
*/
|
|
32
32
|
import { SentencePieceProcessor } from "@sctg/sentencepiece-js";
|
|
33
|
-
import { promises as fs } from "node:fs";
|
|
34
33
|
/** SentencePiece's word-boundary marker (U+2581 LOWER ONE EIGHTH BLOCK). */
|
|
35
34
|
export const SPACE_SENTINEL = "▁";
|
|
36
35
|
export class MailwomanTokenizer {
|
|
@@ -44,9 +43,15 @@ export class MailwomanTokenizer {
|
|
|
44
43
|
await processor.loadFromB64StringModel(b64);
|
|
45
44
|
return new MailwomanTokenizer(processor);
|
|
46
45
|
}
|
|
47
|
-
/**
|
|
46
|
+
/**
|
|
47
|
+
* Load from a path to a `tokenizer.model` file on disk. **Node-only** — the dynamic `node:fs`
|
|
48
|
+
* import keeps this method out of the static dependency graph so the rest of the tokenizer
|
|
49
|
+
* bundles cleanly for the browser. Calling it in a browser throws at runtime; use
|
|
50
|
+
* `loadFromBase64` (or the URL-fetching loaders in `@mailwoman/neural-web`) instead.
|
|
51
|
+
*/
|
|
48
52
|
static async loadFromFile(modelPath) {
|
|
49
|
-
const
|
|
53
|
+
const { readFile } = await import(/* webpackIgnore: true */ "node:fs/promises");
|
|
54
|
+
const buf = await readFile(modelPath);
|
|
50
55
|
return MailwomanTokenizer.loadFromBase64(buf.toString("base64"));
|
|
51
56
|
}
|
|
52
57
|
/**
|
package/out/tokenizer.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AAEH,OAAO,EAAE,sBAAsB,EAAE,MAAM,wBAAwB,CAAA;
|
|
1
|
+
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AAEH,OAAO,EAAE,sBAAsB,EAAE,MAAM,wBAAwB,CAAA;AAE/D,4EAA4E;AAC5E,MAAM,CAAC,MAAM,cAAc,GAAG,GAAG,CAAA;AAmBjC,MAAM,OAAO,kBAAkB;IACO;IAArC,YAAqC,SAAiC;QAAjC,cAAS,GAAT,SAAS,CAAwB;IAAG,CAAC;IAE1E,qFAAqF;IACrF,MAAM,CAAC,KAAK,CAAC,cAAc,CAAC,GAAW;QACtC,MAAM,SAAS,GAAG,IAAI,sBAAsB,EAAE,CAAA;QAC9C,MAAM,SAAS,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAA;QAC3C,OAAO,IAAI,kBAAkB,CAAC,SAAS,CAAC,CAAA;IACzC,CAAC;IAED;;;;;OAKG;IACH,MAAM,CAAC,KAAK,CAAC,YAAY,CAAC,SAAiB;QAC1C,MAAM,EAAE,QAAQ,EAAE,GAAG,MAAM,MAAM,CAAC,yBAAyB,CAAC,kBAAkB,CAAC,CAAA;QAC/E,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,SAAS,CAAC,CAAA;QACrC,OAAO,kBAAkB,CAAC,cAAc,CAAC,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAA;IACjE,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,IAAY;QAClB,MAAM,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC,YAAY,CAAC,IAAI,CAAC,CAAA;QAChD,MAAM,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,IAAI,CAAC,CAAA;QAE1C,MAAM,SAAS,GAAqB,EAAE,CAAA;QACtC,IAAI,MAAM,GAAG,CAAC,CAAA;QAEd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACxB,MAAM,EAAE,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAA;YACvB,MAAM,WAAW,GAAG,KAAK,CAAC,UAAU,CAAC,cAAc,CAAC,CAAA;YACpD,MAAM,OAAO,GAAG,WAAW,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,KAAK,CAAA;YAExE,IAAI,WAAW,EAAE,CAAC;gBACjB,OAAO,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAE,CAAC;oBAAE,MAAM,EAAE,CAAA;YAClE,CAAC;YAED,MAAM,KAAK,GAAG,MAAM,CAAA;YACpB,MAAM,IAAI,OAAO,CAAC,MAAM,CAAA;YACxB,MAAM,GAAG,GAAG,MAAM,CAAA;YAElB,SAAS,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,CAAA;QAC1C,CAAC;QAED,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,GAAG,EAAE,CAAA;IAClC,CAAC;IAED,oFAAoF;IACpF,MAAM,CAAC,GAA0B;QAChC,MAAM,GAAG,GAAG,GAAG,YAAY,UAAU,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QAClE,OAAO,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,GAAG,CAAW,CAAA;IAC/C,CAAC;CACD"}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Secondary-unit regex repair pass — parser-improvement backlog (2026-05-30).
|
|
7
|
+
*
|
|
8
|
+
* The three-arena capability eval surfaced a persistent neural weakness: the
|
|
9
|
+
* model DROPS secondary units. "123 Main St Apt 456" → no unit label; the
|
|
10
|
+
* postal-standards secondary-unit edge class scored 0% neural. Units have a
|
|
11
|
+
* rigid surface shape (a designator keyword + an identifier), so — exactly
|
|
12
|
+
* like the postcode-repair pass (#35) — we can detect them deterministically
|
|
13
|
+
* and repair the BIO labels AFTER decode but BEFORE `buildAddressTree`. The
|
|
14
|
+
* model is untouched; this is a decoder-side correction, the same "lowest
|
|
15
|
+
* risk" lever family as postcode-repair.
|
|
16
|
+
*
|
|
17
|
+
* PRECISION GUARDS (mirror postcode-repair — never regress a confident parse):
|
|
18
|
+
* - We only fire on EXPLICIT designators (Apt, Ste, Suite, Unit, Rm, Floor,
|
|
19
|
+
* Bldg, Flat, … + bare "#<n>"). Ambiguous tokens are deliberately excluded:
|
|
20
|
+
* "Box" (that's po_box), bare "F"/"No" (too greedy), "Space"/"Stop" (common
|
|
21
|
+
* words).
|
|
22
|
+
* - ADD path (model emitted no unit over the matched run): allowed ONLY over
|
|
23
|
+
* `O` tokens — never over house_number / street* / postcode / po_box / a
|
|
24
|
+
* geographic container. So a confidently-labeled street or number is safe.
|
|
25
|
+
* - SNAP path: when the model already started a unit span inside the match,
|
|
26
|
+
* we expand/clip it to the full detected shape.
|
|
27
|
+
* - Local smear-clip: unit tokens immediately flanking a snapped run are
|
|
28
|
+
* cleared (mirrors postcode-repair) so "Apt 4 Springfield" can't leave a
|
|
29
|
+
* stray I-unit on "Springfield".
|
|
30
|
+
*
|
|
31
|
+
* Opt-in via `ParseOpts.unitRepair` (postcode-repair earned default-on only
|
|
32
|
+
* after a measured +135/0; unit-repair stays opt-in until the v0.7.2 arena
|
|
33
|
+
* re-run quantifies its delta).
|
|
34
|
+
*/
|
|
35
|
+
import type { DecoderToken } from "@mailwoman/core/decoder";
|
|
36
|
+
export interface RepairResult {
|
|
37
|
+
tokens: DecoderToken[];
|
|
38
|
+
/** Number of token labels changed — for telemetry / logging. */
|
|
39
|
+
changed: number;
|
|
40
|
+
}
|
|
41
|
+
/**
|
|
42
|
+
* Repair secondary-unit label spans in a decoded token sequence using designator
|
|
43
|
+
* regexes. Returns a NEW token array (inputs are not mutated) plus a change count.
|
|
44
|
+
*/
|
|
45
|
+
export declare function repairUnitLabels(text: string, input: readonly DecoderToken[]): RepairResult;
|
|
46
|
+
//# sourceMappingURL=unit-repair.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"unit-repair.d.ts","sourceRoot":"","sources":["../unit-repair.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiCG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAA;AA+E3D,MAAM,WAAW,YAAY;IAC5B,MAAM,EAAE,YAAY,EAAE,CAAA;IACtB,gEAAgE;IAChE,OAAO,EAAE,MAAM,CAAA;CACf;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,YAAY,EAAE,GAAG,YAAY,CA8C3F"}
|