@mailwoman/neural 4.0.0 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/anchor-inference.d.ts +10 -9
- package/out/anchor-inference.d.ts.map +1 -1
- package/out/anchor-inference.js +10 -9
- package/out/anchor-inference.js.map +1 -1
- package/out/classifier.d.ts +38 -7
- package/out/classifier.d.ts.map +1 -1
- package/out/classifier.js +19 -6
- package/out/classifier.js.map +1 -1
- package/out/gazetteer-inference.d.ts +72 -0
- package/out/gazetteer-inference.d.ts.map +1 -0
- package/out/gazetteer-inference.js +163 -0
- package/out/gazetteer-inference.js.map +1 -0
- package/out/index.d.ts +1 -0
- package/out/index.d.ts.map +1 -1
- package/out/index.js +1 -0
- package/out/index.js.map +1 -1
- package/out/onnx-runner.d.ts +3 -0
- package/out/onnx-runner.d.ts.map +1 -1
- package/out/onnx-runner.js +26 -1
- package/out/onnx-runner.js.map +1 -1
- package/out/unit-repair.d.ts +21 -25
- package/out/unit-repair.d.ts.map +1 -1
- package/out/unit-repair.js +33 -38
- package/out/unit-repair.js.map +1 -1
- package/package.json +3 -3
|
@@ -4,14 +4,15 @@
|
|
|
4
4
|
* @author Teffen Ellis, et al.
|
|
5
5
|
*
|
|
6
6
|
* Inference-side postcode-anchor features (#239/#240) — the mirror of the Python training pipeline
|
|
7
|
-
* (`mailwoman_train/tokenizer.py::anchor_feature_vector` + `realign_anchor_to_pieces`). At inference
|
|
8
|
-
* the model conditions on per-piece anchor features fed alongside `input_ids`; this builds them from
|
|
9
|
-
* a raw address + its SentencePiece pieces, using the SAME postcode→anchor lookup the model trained
|
|
10
|
-
* against (`scripts/build-pilot-anchor-lookup.py`), so the feature layout matches byte-for-byte.
|
|
7
|
+
* (`mailwoman_train/tokenizer.py::anchor_feature_vector` + `realign_anchor_to_pieces`). At
|
|
8
|
+
* inference the model conditions on per-piece anchor features fed alongside `input_ids`; this
|
|
9
|
+
* builds them from a raw address + its SentencePiece pieces, using the SAME postcode→anchor
|
|
10
|
+
* lookup the model trained against (`scripts/build-pilot-anchor-lookup.py`), so the feature
|
|
11
|
+
* layout matches byte-for-byte.
|
|
11
12
|
*
|
|
12
13
|
* The layout is LOAD-BEARING and cross-language: a wrong locale order or centroid scale feeds the
|
|
13
|
-
* model garbage. `anchor-inference.test.ts` pins both `LOCALE_ORDER` and the vector to values emitted
|
|
14
|
-
* by the Python `anchor_feature_vector` — any drift fails the test.
|
|
14
|
+
* model garbage. `anchor-inference.test.ts` pins both `LOCALE_ORDER` and the vector to values
|
|
15
|
+
* emitted by the Python `anchor_feature_vector` — any drift fails the test.
|
|
15
16
|
*/
|
|
16
17
|
import type { TokenizedPiece } from "./tokenizer.js";
|
|
17
18
|
/**
|
|
@@ -36,9 +37,9 @@ export type AnchorLookup = Map<string, AnchorEntry>;
|
|
|
36
37
|
*/
|
|
37
38
|
export declare function anchorFeatureVector(posterior: Record<string, number>, lat: number, lon: number): number[];
|
|
38
39
|
/**
|
|
39
|
-
* Parse the pilot postcode→anchor lookup JSON (`{postcode: [posterior, lat, lon]}`) into a Map. Pure
|
|
40
|
-
* (takes the parsed object, not a path) so this module stays browser-safe — the file read lives in
|
|
41
|
-
* the Node-side caller (the eval).
|
|
40
|
+
* Parse the pilot postcode→anchor lookup JSON (`{postcode: [posterior, lat, lon]}`) into a Map.
|
|
41
|
+
* Pure (takes the parsed object, not a path) so this module stays browser-safe — the file read
|
|
42
|
+
* lives in the Node-side caller (the eval).
|
|
42
43
|
*/
|
|
43
44
|
export declare function parseAnchorLookup(raw: Record<string, [Record<string, number>, number, number]>): AnchorLookup;
|
|
44
45
|
/**
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"anchor-inference.d.ts","sourceRoot":"","sources":["../anchor-inference.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"anchor-inference.d.ts","sourceRoot":"","sources":["../anchor-inference.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAEpD;;;;GAIG;AACH,eAAO,MAAM,YAAY,iEAAkE,CAAA;AAE3F,6EAA6E;AAC7E,eAAO,MAAM,kBAAkB,QAA0B,CAAA;AAEzD,mGAAmG;AACnG,MAAM,WAAW,WAAW;IAC3B,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IACjC,GAAG,EAAE,MAAM,CAAA;IACX,GAAG,EAAE,MAAM,CAAA;CACX;AAED,MAAM,MAAM,YAAY,GAAG,GAAG,CAAC,MAAM,EAAE,WAAW,CAAC,CAAA;AAEnD;;;;GAIG;AACH,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE,CAgBzG;AAED;;;;GAIG;AACH,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC,GAAG,YAAY,CAI7G;AAED;;;;;;;;GAQG;AACH,wBAAgB,mBAAmB,CAClC,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,aAAa,CAAC,cAAc,CAAC,EACrC,MAAM,EAAE,YAAY,GAClB;IAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,CAAC;IAAC,UAAU,EAAE,MAAM,EAAE,CAAA;CAAE,CA0BhD"}
|
package/out/anchor-inference.js
CHANGED
|
@@ -4,14 +4,15 @@
|
|
|
4
4
|
* @author Teffen Ellis, et al.
|
|
5
5
|
*
|
|
6
6
|
* Inference-side postcode-anchor features (#239/#240) — the mirror of the Python training pipeline
|
|
7
|
-
* (`mailwoman_train/tokenizer.py::anchor_feature_vector` + `realign_anchor_to_pieces`). At inference
|
|
8
|
-
* the model conditions on per-piece anchor features fed alongside `input_ids`; this builds them from
|
|
9
|
-
* a raw address + its SentencePiece pieces, using the SAME postcode→anchor lookup the model trained
|
|
10
|
-
* against (`scripts/build-pilot-anchor-lookup.py`), so the feature layout matches byte-for-byte.
|
|
7
|
+
* (`mailwoman_train/tokenizer.py::anchor_feature_vector` + `realign_anchor_to_pieces`). At
|
|
8
|
+
* inference the model conditions on per-piece anchor features fed alongside `input_ids`; this
|
|
9
|
+
* builds them from a raw address + its SentencePiece pieces, using the SAME postcode→anchor
|
|
10
|
+
* lookup the model trained against (`scripts/build-pilot-anchor-lookup.py`), so the feature
|
|
11
|
+
* layout matches byte-for-byte.
|
|
11
12
|
*
|
|
12
13
|
* The layout is LOAD-BEARING and cross-language: a wrong locale order or centroid scale feeds the
|
|
13
|
-
* model garbage. `anchor-inference.test.ts` pins both `LOCALE_ORDER` and the vector to values emitted
|
|
14
|
-
* by the Python `anchor_feature_vector` — any drift fails the test.
|
|
14
|
+
* model garbage. `anchor-inference.test.ts` pins both `LOCALE_ORDER` and the vector to values
|
|
15
|
+
* emitted by the Python `anchor_feature_vector` — any drift fails the test.
|
|
15
16
|
*/
|
|
16
17
|
/**
|
|
17
18
|
* The locale class order — MUST match Python `mailwoman_train/labels.py::LOCALE_COUNTRIES`. The
|
|
@@ -45,9 +46,9 @@ export function anchorFeatureVector(posterior, lat, lon) {
|
|
|
45
46
|
return vec;
|
|
46
47
|
}
|
|
47
48
|
/**
|
|
48
|
-
* Parse the pilot postcode→anchor lookup JSON (`{postcode: [posterior, lat, lon]}`) into a Map. Pure
|
|
49
|
-
* (takes the parsed object, not a path) so this module stays browser-safe — the file read lives in
|
|
50
|
-
* the Node-side caller (the eval).
|
|
49
|
+
* Parse the pilot postcode→anchor lookup JSON (`{postcode: [posterior, lat, lon]}`) into a Map.
|
|
50
|
+
* Pure (takes the parsed object, not a path) so this module stays browser-safe — the file read
|
|
51
|
+
* lives in the Node-side caller (the eval).
|
|
51
52
|
*/
|
|
52
53
|
export function parseAnchorLookup(raw) {
|
|
53
54
|
const out = new Map();
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"anchor-inference.js","sourceRoot":"","sources":["../anchor-inference.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"anchor-inference.js","sourceRoot":"","sources":["../anchor-inference.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAIH;;;;GAIG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAU,CAAA;AAE3F,6EAA6E;AAC7E,MAAM,CAAC,MAAM,kBAAkB,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC,CAAA;AAWzD;;;;GAIG;AACH,MAAM,UAAU,mBAAmB,CAAC,SAAiC,EAAE,GAAW,EAAE,GAAW;IAC9F,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,kBAAkB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACzD,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,KAAK,MAAM,CAAC,OAAO,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC;QAC3D,MAAM,GAAG,GAAG,YAAY,CAAC,OAAO,CAAC,OAAO,CAAC,WAAW,EAAmC,CAAC,CAAA;QACxF,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC;YACd,GAAG,CAAC,GAAG,CAAC,GAAG,MAAM,CAAA;YACjB,KAAK,IAAI,MAAM,CAAA;QAChB,CAAC;IACF,CAAC;IACD,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;QACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,CAAC,CAAE,IAAI,KAAK,CAAA;IAC/D,CAAC;IACD,GAAG,CAAC,YAAY,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,EAAE,CAAC,CAAC,CAAA;IAC9D,GAAG,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,GAAG,CAAC,CAAC,CAAA;IACnE,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,iBAAiB,CAAC,GAA6D;IAC9F,MAAM,GAAG,GAAiB,IAAI,GAAG,EAAE,CAAA;IACnC,KAAK,MAAM,CAAC,EAAE,EAAE,CAAC,SAAS,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,GAAG,CAAC,GAAG,CAAC,EAAE,EAAE,EAAE,SAAS,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC,CAAA;IACnG,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,mBAAmB,CAClC,IAAY,EACZ,MAAqC,EACrC,MAAoB;IAEpB,MAAM,QAAQ,GAAe,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,IAAI,KAAK,CAAS,kBAAkB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;IAC5F,MAAM,UAAU,GAAa,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAA;IAEhD,MAAM,OAAO,GAAG,eAAe,CAAA;IAC/B,IAAI,CAAyB,CAAA;IAC7B,OAAO,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC1C,MAA
M,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAA;QAC5C,IAAI,CAAC,KAAK;YAAE,SAAQ;QACpB,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAA;QACzB,MAAM,OAAO,GAAG,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAA;QACrC,MAAM,GAAG,GAAG,mBAAmB,CAAC,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,GAAG,EAAE,KAAK,CAAC,GAAG,CAAC,CAAA;QACtE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACpB,KAAK,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,CAAC,GAAG,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,EAAE,CAAC;oBAC7C,IAAI,CAAC,IAAI,SAAS,IAAI,CAAC,GAAG,OAAO,EAAE,CAAC;wBACnC,QAAQ,CAAC,CAAC,CAAC,GAAG,GAAG,CAAA;wBACjB,UAAU,CAAC,CAAC,CAAC,GAAG,GAAG,CAAA;oBACpB,CAAC;oBACD,MAAK,CAAC,oFAAoF;gBAC3F,CAAC;YACF,CAAC;QACF,CAAC;IACF,CAAC;IACD,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAA;AAChC,CAAC"}
|
package/out/classifier.d.ts
CHANGED
|
@@ -9,13 +9,14 @@
|
|
|
9
9
|
*
|
|
10
10
|
* Convenience wrappers `parseJson` / `parseTuples` / `parseXml` project the tree on the way out.
|
|
11
11
|
*/
|
|
12
|
-
import { decodeAsXml, type AddressTree, type ComponentTag } from "@mailwoman/core/decoder";
|
|
12
|
+
import { decodeAsXml, type AddressTree, type Calibrator, type ComponentTag } from "@mailwoman/core/decoder";
|
|
13
|
+
import { type AnchorLookup } from "./anchor-inference.js";
|
|
14
|
+
import { type GazetteerLexicon } from "./gazetteer-inference.js";
|
|
13
15
|
import { type FstMatcherLike } from "./fst-prior.js";
|
|
14
16
|
import type { InferResult } from "./onnx-runner.js";
|
|
15
17
|
import { type QueryShapeLike } from "./query-shape-prior.js";
|
|
16
18
|
import { type StreetMorphologyPriorOpts } from "./street-morphology-prior.js";
|
|
17
19
|
import { MailwomanTokenizer } from "./tokenizer.js";
|
|
18
|
-
import { type AnchorLookup } from "./anchor-inference.js";
|
|
19
20
|
import type { ResolveWeightsOpts } from "./weights.js";
|
|
20
21
|
/**
|
|
21
22
|
* Structural type the classifier needs from a runner. Lets callers swap the Node-side `OnnxRunner`
|
|
@@ -26,6 +27,9 @@ export interface NeuralRunner {
|
|
|
26
27
|
infer(tokenIds: number[], anchor?: {
|
|
27
28
|
features: ReadonlyArray<ReadonlyArray<number>>;
|
|
28
29
|
confidence: ReadonlyArray<number>;
|
|
30
|
+
}, gazetteer?: {
|
|
31
|
+
features: ReadonlyArray<ReadonlyArray<number>>;
|
|
32
|
+
confidence: ReadonlyArray<number>;
|
|
29
33
|
}): Promise<InferResult>;
|
|
30
34
|
}
|
|
31
35
|
export interface NeuralAddressClassifierConfig {
|
|
@@ -63,6 +67,26 @@ export interface NeuralAddressClassifierConfig {
|
|
|
63
67
|
* models. Load via `parseAnchorLookup` from `./anchor-inference.js`.
|
|
64
68
|
*/
|
|
65
69
|
postcodeAnchorLookup?: AnchorLookup;
|
|
70
|
+
/**
|
|
71
|
+
* Optional gazetteer-anchor lexicon (#464, knowledge-ladder rung 3.2). When set, `parse` builds
|
|
72
|
+
* per-token candidate-tag-set clues (country/region/po_box/cedex/homograph) from the text + this
|
|
73
|
+
* lexicon and feeds them to the runner — for models trained with the gazetteer-anchor channel
|
|
74
|
+
* (exported with the `gazetteer_features`/`gazetteer_confidence` ONNX inputs). Omit for plain
|
|
75
|
+
* models. Load via `parseGazetteerLexicon` from `./gazetteer-inference.js`.
|
|
76
|
+
*/
|
|
77
|
+
gazetteerLexicon?: GazetteerLexicon;
|
|
78
|
+
/**
|
|
79
|
+
* Channel choreography (#464, v0.9.13 postcode fix): when true, zero the gazetteer clue on pieces
|
|
80
|
+
* adjacent to a postcode-anchor hit (needs both `gazetteerLexicon` and `postcodeAnchorLookup`).
|
|
81
|
+
* Targets the region-clue→postcode CRF interference (~3pp US postcode).
|
|
82
|
+
*
|
|
83
|
+
* PAIRING IS LOAD-BEARING: set this IFF the model was TRAINED with the matching train-time
|
|
84
|
+
* choreography (`data.gazetteer_choreography`). The 2026-06-10 diagnostic showed the harm is
|
|
85
|
+
* WEIGHT-BAKED — applying this at inference on a model trained *without* train-choreography does
|
|
86
|
+
* NOT recover postcode and adds train/inference skew. Only enable for a consolidation-era model
|
|
87
|
+
* trained with the train-time half.
|
|
88
|
+
*/
|
|
89
|
+
suppressGazetteerNearPostcode?: boolean;
|
|
66
90
|
}
|
|
67
91
|
export declare class NeuralAddressClassifier {
|
|
68
92
|
private readonly cfg;
|
|
@@ -163,12 +187,19 @@ export interface ParseOpts {
|
|
|
163
187
|
*/
|
|
164
188
|
postcodeRepair?: boolean;
|
|
165
189
|
/**
|
|
166
|
-
* When true, run the deterministic secondary-unit regex repair pass on the decoded label
|
|
167
|
-
*
|
|
168
|
-
*
|
|
169
|
-
*
|
|
170
|
-
*
|
|
190
|
+
* When true, run the deterministic secondary-unit regex repair pass on the decoded label sequence
|
|
191
|
+
* before tree-building. Detects designator-shaped substrings ("Apt 4B", "Ste 12", "Unit 9400",
|
|
192
|
+
* bare "#104", …) and snaps/adds the unit span, fixing the unit-drop weakness the three-arena
|
|
193
|
+
* capability eval surfaced (postal secondary-unit 0% neural). Off by default — opt-in until the
|
|
194
|
+
* v0.7.2 arena re-run quantifies its delta. See `./unit-repair.ts`.
|
|
171
195
|
*/
|
|
172
196
|
unitRepair?: boolean;
|
|
197
|
+
/**
|
|
198
|
+
* Optional span-confidence calibrator (task #59). When provided, each decoded span's `conf=` is
|
|
199
|
+
* mapped through it (isotonic lookup table → calibrated probability of correctness). OPT-IN —
|
|
200
|
+
* omit for the byte-stable default softmax confidence. Build one via `createCalibrator`
|
|
201
|
+
* (`@mailwoman/core/decoder`) from `data/eval/calibration/isotonic-<locale>-<version>.json`.
|
|
202
|
+
*/
|
|
203
|
+
calibrate?: Calibrator;
|
|
173
204
|
}
|
|
174
205
|
//# sourceMappingURL=classifier.d.ts.map
|
package/out/classifier.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAIN,WAAW,EACX,KAAK,WAAW,EAChB,KAAK,YAAY,EAEjB,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAA0B,KAAK,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAE5E,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA;
|
|
1
|
+
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAIN,WAAW,EACX,KAAK,WAAW,EAChB,KAAK,UAAU,EACf,KAAK,YAAY,EAEjB,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAAuB,KAAK,YAAY,EAAE,MAAM,uBAAuB,CAAA;AAC9E,OAAO,EAAyD,KAAK,gBAAgB,EAAE,MAAM,0BAA0B,CAAA;AACvH,OAAO,EAA0B,KAAK,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAE5E,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA;AAEnD,OAAO,EAA0C,KAAK,cAAc,EAAE,MAAM,wBAAwB,CAAA;AACpG,OAAO,EAAuC,KAAK,yBAAyB,EAAE,MAAM,8BAA8B,CAAA;AAClH,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AAGnD,OAAO,KAAK,EAAE,kBAAkB,EAAmB,MAAM,cAAc,CAAA;AAEvE;;;;GAIG;AACH,MAAM,WAAW,YAAY;IAC5B,KAAK,CACJ,QAAQ,EAAE,MAAM,EAAE,EAClB,MAAM,CAAC,EAAE;QAAE,QAAQ,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC;QAAC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,CAAA;KAAE,EAC9F,SAAS,CAAC,EAAE;QAAE,QAAQ,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC;QAAC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,CAAA;KAAE,GAC/F,OAAO,CAAC,WAAW,CAAC,CAAA;CACvB;AAED,MAAM,WAAW,6BAA6B;IAC7C,SAAS,EAAE,kBAAkB,CAAA;IAC7B,MAAM,EAAE,YAAY,CAAA;IACpB;;;;OAIG;IACH,MAAM,CAAC,EAAE,SAAS,MAAM,EAAE,CAAA;IAC1B;;;;;;;OAOG;IACH,MAAM,CAAC,EAAE,SAAS,GAAG,QAAQ,CAAA;IAC7B;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,EAAE,EAAE,CAAA;IACxB,sEAAsE;IACtE,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAA;IAC3B,oEAAoE;IACpE,cAAc,CAAC,EAAE,MAAM,EAAE,CAAA;IACzB;;;;;OAKG;IACH,oBAAoB,CAAC,EAAE,YAAY,CAAA;IACnC;;;;;;OAMG;IACH,gBAAgB,CAAC,EAAE,gBAAgB,CAAA;IACnC;;;;;;;;;;OAUG;IACH,6BAA6B,CAAC,EAAE,OAAO,CAAA;CACvC;AAED,qBAAa,uBAAuB;IAOvB,OAAO,CAAC,QAAQ,CAAC,GAAG;IANhC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmB;IAC1C,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAsB;IACjD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAY;IACxC,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAU;IAC3C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAU;gBAEZ,GAAG,EAAE,6BAA6B;IAa/D;;;;;;;;;;;OAWG;WACU,eAAe,CAC3B,IAAI,GAAE,kBAAkB,GAAG;QAAE,oBAAoB,CAAC,EAAE,YAAY,CAAA;KAAO,GACrE,OAAO,CAAC,uBAAuB,CAAC;IA4BnC,6DAA6D;IACvD,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC;IAmFjE;;;OAGG;IACG,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG
,OAAO,CAAC,qBAAqB,CAAC;IAiF/E,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAIzF,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAInF,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG;QAAE,GAAG,CAAC,EAAE,UAAU,CAAC,OAAO,WAAW,CAAC,CAAC,CAAC,CAAC,CAAA;KAAE,GAAG,OAAO,CAAC,MAAM,CAAC;IAI7G;;;;;;;;;;OAUG;IACH,OAAO,CAAC,mBAAmB;CAW3B;AAED,wFAAwF;AACxF,MAAM,WAAW,qBAAqB;IACrC,IAAI,EAAE,WAAW,CAAA;IACjB,MAAM,EAAE,MAAM,EAAE,EAAE,CAAA;IAClB,MAAM,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;CAC7C;AAED;;;GAGG;AACH,MAAM,WAAW,SAAS;IACzB;;;;OAIG;IACH,UAAU,CAAC,EAAE,cAAc,CAAA;IAC3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAA;IAC5B;;;OAGG;IACH,GAAG,CAAC,EAAE,cAAc,CAAA;IACpB,6DAA6D;IAC7D,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB;;;;;;OAMG;IACH,mBAAmB,CAAC,EAAE,cAAc,CAAA;IACpC,yDAAyD;IACzD,uBAAuB,CAAC,EAAE,yBAAyB,CAAA;IACnD;;;;;;OAMG;IACH,cAAc,CAAC,EAAE,OAAO,CAAA;IACxB;;;;;;OAMG;IACH,UAAU,CAAC,EAAE,OAAO,CAAA;IACpB;;;;;OAKG;IACH,SAAS,CAAC,EAAE,UAAU,CAAA;CACtB"}
|
package/out/classifier.js
CHANGED
|
@@ -10,14 +10,15 @@
|
|
|
10
10
|
* Convenience wrappers `parseJson` / `parseTuples` / `parseXml` project the tree on the way out.
|
|
11
11
|
*/
|
|
12
12
|
import { buildAddressTree, decodeAsJson, decodeAsTuples, decodeAsXml, } from "@mailwoman/core/decoder";
|
|
13
|
+
import { buildAnchorFeatures } from "./anchor-inference.js";
|
|
14
|
+
import { buildGazetteerFeatures, suppressGazetteerNearPostcode } from "./gazetteer-inference.js";
|
|
13
15
|
import { buildFstEmissionPriors } from "./fst-prior.js";
|
|
14
16
|
import { STAGE2_BIO_LABELS } from "./labels.js";
|
|
15
17
|
import { repairPostcodeLabels } from "./postcode-repair.js";
|
|
16
|
-
import { repairUnitLabels } from "./unit-repair.js";
|
|
17
18
|
import { addEmissionMatrix, buildEmissionPriors } from "./query-shape-prior.js";
|
|
18
19
|
import { buildStreetMorphologyEmissionPriors } from "./street-morphology-prior.js";
|
|
19
20
|
import { MailwomanTokenizer } from "./tokenizer.js";
|
|
20
|
-
import {
|
|
21
|
+
import { repairUnitLabels } from "./unit-repair.js";
|
|
21
22
|
import { buildBioEndMask, buildBioStartMask, buildBioTransitionMask, softmax, viterbi } from "./viterbi.js";
|
|
22
23
|
export class NeuralAddressClassifier {
|
|
23
24
|
cfg;
|
|
@@ -89,7 +90,13 @@ export class NeuralAddressClassifier {
|
|
|
89
90
|
const anchor = this.cfg.postcodeAnchorLookup
|
|
90
91
|
? buildAnchorFeatures(text, pieces, this.cfg.postcodeAnchorLookup)
|
|
91
92
|
: undefined;
|
|
92
|
-
const
|
|
93
|
+
const gazetteer = this.cfg.gazetteerLexicon
|
|
94
|
+
? buildGazetteerFeatures(text, pieces, this.cfg.gazetteerLexicon)
|
|
95
|
+
: undefined;
|
|
96
|
+
const gazFed = gazetteer && anchor && this.cfg.suppressGazetteerNearPostcode
|
|
97
|
+
? suppressGazetteerNearPostcode(gazetteer, anchor.confidence)
|
|
98
|
+
: gazetteer;
|
|
99
|
+
const { logits } = await this.cfg.runner.infer(ids, anchor, gazFed);
|
|
93
100
|
this.assertEmissionWidth(logits);
|
|
94
101
|
let emissions = opts?.queryShape
|
|
95
102
|
? addEmissionMatrix(logits, buildEmissionPriors(opts.queryShape, pieces, this.labels, {
|
|
@@ -130,7 +137,7 @@ export class NeuralAddressClassifier {
|
|
|
130
137
|
if (opts?.unitRepair) {
|
|
131
138
|
tokens = repairUnitLabels(text, tokens).tokens;
|
|
132
139
|
}
|
|
133
|
-
return buildAddressTree(text, tokens);
|
|
140
|
+
return buildAddressTree(text, tokens, opts?.calibrate ? { calibrate: opts.calibrate } : undefined);
|
|
134
141
|
}
|
|
135
142
|
/**
|
|
136
143
|
* Like `parse`, but also returns the raw per-token logits and piece offsets needed for per-span
|
|
@@ -146,7 +153,13 @@ export class NeuralAddressClassifier {
|
|
|
146
153
|
const anchor = this.cfg.postcodeAnchorLookup
|
|
147
154
|
? buildAnchorFeatures(text, pieces, this.cfg.postcodeAnchorLookup)
|
|
148
155
|
: undefined;
|
|
149
|
-
const
|
|
156
|
+
const gazetteer = this.cfg.gazetteerLexicon
|
|
157
|
+
? buildGazetteerFeatures(text, pieces, this.cfg.gazetteerLexicon)
|
|
158
|
+
: undefined;
|
|
159
|
+
const gazFed = gazetteer && anchor && this.cfg.suppressGazetteerNearPostcode
|
|
160
|
+
? suppressGazetteerNearPostcode(gazetteer, anchor.confidence)
|
|
161
|
+
: gazetteer;
|
|
162
|
+
const { logits } = await this.cfg.runner.infer(ids, anchor, gazFed);
|
|
150
163
|
this.assertEmissionWidth(logits);
|
|
151
164
|
let emissions = opts?.queryShape
|
|
152
165
|
? addEmissionMatrix(logits, buildEmissionPriors(opts.queryShape, pieces, this.labels, {
|
|
@@ -182,7 +195,7 @@ export class NeuralAddressClassifier {
|
|
|
182
195
|
};
|
|
183
196
|
});
|
|
184
197
|
return {
|
|
185
|
-
tree: buildAddressTree(text, tokens),
|
|
198
|
+
tree: buildAddressTree(text, tokens, opts?.calibrate ? { calibrate: opts.calibrate } : undefined),
|
|
186
199
|
logits,
|
|
187
200
|
pieces: pieces.map((p) => ({ start: p.start, end: p.end })),
|
|
188
201
|
};
|
package/out/classifier.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"classifier.js","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EACN,gBAAgB,EAChB,YAAY,EACZ,cAAc,EACd,WAAW,
|
|
1
|
+
{"version":3,"file":"classifier.js","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EACN,gBAAgB,EAChB,YAAY,EACZ,cAAc,EACd,WAAW,GAKX,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAAE,mBAAmB,EAAqB,MAAM,uBAAuB,CAAA;AAC9E,OAAO,EAAE,sBAAsB,EAAE,6BAA6B,EAAyB,MAAM,0BAA0B,CAAA;AACvH,OAAO,EAAE,sBAAsB,EAAuB,MAAM,gBAAgB,CAAA;AAC5E,OAAO,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAA;AAE/C,OAAO,EAAE,oBAAoB,EAAE,MAAM,sBAAsB,CAAA;AAC3D,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAuB,MAAM,wBAAwB,CAAA;AACpG,OAAO,EAAE,mCAAmC,EAAkC,MAAM,8BAA8B,CAAA;AAClH,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAA;AACnD,OAAO,EAAE,eAAe,EAAE,iBAAiB,EAAE,sBAAsB,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAA;AAyE3G,MAAM,OAAO,uBAAuB;IAON;IANZ,MAAM,CAAmB;IACzB,UAAU,CAAsB;IAChC,WAAW,CAAY;IACvB,gBAAgB,CAAU;IAC1B,cAAc,CAAU;IAEzC,YAA6B,GAAkC;QAAlC,QAAG,GAAH,GAAG,CAA+B;QAC9D,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,MAAM,IAAI,iBAAiB,CAAA;QAC7C,IAAI,CAAC,UAAU,GAAG,GAAG,CAAC,MAAM,IAAI,SAAS,CAAA;QACzC,MAAM,UAAU,GAAG,sBAAsB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACtD,IAAI,GAAG,CAAC,WAAW,EAAE,CAAC;YACrB,IAAI,CAAC,WAAW,GAAG,WAAW,CAAC,UAAU,EAAE,GAAG,CAAC,WAAW,CAAC,CAAA;QAC5D,CAAC;aAAM,CAAC;YACP,IAAI,CAAC,WAAW,GAAG,UAAU,CAAA;QAC9B,CAAC;QACD,IAAI,CAAC,gBAAgB,GAAG,GAAG,CAAC,gBAAgB,IAAI,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC9E,IAAI,CAAC,cAAc,GAAG,GAAG,CAAC,cAAc,IAAI,eAAe,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACzE,CAAC;IAED;;;;;;;;;;;OAWG;IACH,MAAM,CAAC,KAAK,CAAC,eAAe,CAC3B,OAAqE,EAAE;QAEvE,yFAAyF;QACzF,2FAA2F;QAC3F,uFAAuF;QACvF,0FAA0F;QAC1F,2BAA2B;QAC3B,MAAM,CAAC,EAAE,UAAU,EAAE,EAAE,EAAE,cAAc,EAAE,uBAAuB,EAAE,kBAAkB,EAAE,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC3G,MAAM,CAAC,yBAAyB,CAAC,kBAAkB,CAAC;YACpD,MAAM,CAAC,yBAAyB,CAAC,cAAc,CAAC;SAChD,CAAC,CAAA;QACF,MAAM,QAAQ,GAAoB,cAAc,CAAC,IAAI,CAAC,CAAA;QACtD,MAAM,MAAM,GAAG,uBAAuB,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAA;QAC9D,MAAM,GAAG,GAAG,kBAAkB,CAAC,QAAQ,CAAC,kBAAkB,CAAC,CAAA;QAC3D,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC7C,kBAAkB,CAAC,YAAY,CAAC,QAAQ,CAA
C,aAAa,CAAC;YACvD,UAAU,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC;SACrC,CAAC,CAAA;QACF,OAAO,IAAI,uBAAuB,CAAC;YAClC,SAAS;YACT,MAAM;YACN,MAAM;YACN,WAAW,EAAE,GAAG,EAAE,WAAW;YAC7B,gBAAgB,EAAE,GAAG,EAAE,gBAAgB;YACvC,cAAc,EAAE,GAAG,EAAE,cAAc;YACnC,GAAG,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC,CAAC,EAAE,oBAAoB,EAAE,IAAI,CAAC,oBAAoB,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACzF,CAAC,CAAA;IACH,CAAC;IAED,6DAA6D;IAC7D,KAAK,CAAC,KAAK,CAAC,IAAY,EAAE,IAAgB;QACzC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;QAEtD,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;QACvD,gGAAgG;QAChG,+EAA+E;QAC/E,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,oBAAoB;YAC3C,CAAC,CAAC,mBAAmB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,oBAAoB,CAAC;YAClE,CAAC,CAAC,SAAS,CAAA;QACZ,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,gBAAgB;YAC1C,CAAC,CAAC,sBAAsB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC;YACjE,CAAC,CAAC,SAAS,CAAA;QACZ,MAAM,MAAM,GACX,SAAS,IAAI,MAAM,IAAI,IAAI,CAAC,GAAG,CAAC,6BAA6B;YAC5D,CAAC,CAAC,6BAA6B,CAAC,SAAS,EAAE,MAAM,CAAC,UAAU,CAAC;YAC7D,CAAC,CAAC,SAAS,CAAA;QACb,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,CAAC,CAAA;QAEnE,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,CAAA;QAEhC,IAAI,SAAS,GAAG,IAAI,EAAE,UAAU;YAC/B,CAAC,CAAC,iBAAiB,CACjB,MAAM,EACN,mBAAmB,CAAC,IAAI,CAAC,UAAU,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACzD,SAAS,EAAE,IAAI,CAAC,mBAAmB,IAAI,GAAG;gBAC1C,SAAS,EAAE,IAAI;aACf,CAAC,CACF;YACF,CAAC,CAAC,MAAM,CAAA;QAET,IAAI,IAAI,EAAE,GAAG,EAAE,CAAC;YACf,SAAS,GAAG,iBAAiB,CAC5B,SAAS,EACT,sBAAsB,CAAC,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACrD,SAAS,EAAE,IAAI,CAAC,YAAY,IAAI,GAAG;aACnC,CAAC,CACF,CAAA;QACF,CAAC;QAED,IAAI,IAAI,EAAE,mBAAmB,EAAE,CAAC;YAC/B,SAAS,GAAG,iBAAiB,CAC5B,SAAS,EACT,mCAAmC,CAClC,IAAI,CAAC,mBAAmB,EACxB,MAAM,EACN,IAAI,CAAC,MAAM,EACX,IAAI,CAAC,uBAAuB,IAAI,EAAE,CAClC,CACD,CAAA;QACF,CAAC;QAED,MAAM,YAAY,GACjB,IAAI,CAAC,UAAU,KAAK,SAAS;YAC5B,CAAC,CAAC,OAAO,CAAC;gBACR,SAAS;gBACT,WAAW,EAAE,IAAI,C
AAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;gBACvC,cAAc,EAAE,IAAI,CAAC,cAAc;aACnC,CAAC,CAAC,IAAI;YACR,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAA;QAElD,IAAI,MAAM,GAAmB,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAChD,MAAM,GAAG,GAAG,YAAY,CAAC,CAAC,CAAE,CAAA;YAC5B,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,CAAA;YACjC,OAAO;gBACN,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,GAAG,EAAE,CAAC,CAAC,GAAG;gBACV,KAAK,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,GAAG,CAA0B;gBACzD,UAAU,EAAE,KAAK,CAAC,GAAG,CAAE;aACvB,CAAA;QACF,CAAC,CAAC,CAAA;QAEF,IAAI,IAAI,EAAE,cAAc,EAAE,CAAC;YAC1B,MAAM,GAAG,oBAAoB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC,MAAM,CAAA;QACnD,CAAC;QACD,IAAI,IAAI,EAAE,UAAU,EAAE,CAAC;YACtB,MAAM,GAAG,gBAAgB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC,MAAM,CAAA;QAC/C,CAAC;QAED,OAAO,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAA;IACnG,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,eAAe,CAAC,IAAY,EAAE,IAAgB;QACnD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,OAAO,EAAE,IAAI,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,CAAA;QAClE,CAAC;QACD,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;QACvD,gGAAgG;QAChG,+EAA+E;QAC/E,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,oBAAoB;YAC3C,CAAC,CAAC,mBAAmB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,oBAAoB,CAAC;YAClE,CAAC,CAAC,SAAS,CAAA;QACZ,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,gBAAgB;YAC1C,CAAC,CAAC,sBAAsB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC;YACjE,CAAC,CAAC,SAAS,CAAA;QACZ,MAAM,MAAM,GACX,SAAS,IAAI,MAAM,IAAI,IAAI,CAAC,GAAG,CAAC,6BAA6B;YAC5D,CAAC,CAAC,6BAA6B,CAAC,SAAS,EAAE,MAAM,CAAC,UAAU,CAAC;YAC7D,CAAC,CAAC,SAAS,CAAA;QACb,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,CAAC,CAAA;QAEnE,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,CAAA;QAEhC,IAAI,SAAS,GAAG,IAAI,EAAE,UAAU;YAC/B,CAAC,CAAC,iBA
AiB,CACjB,MAAM,EACN,mBAAmB,CAAC,IAAI,CAAC,UAAU,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACzD,SAAS,EAAE,IAAI,CAAC,mBAAmB,IAAI,GAAG;gBAC1C,SAAS,EAAE,IAAI;aACf,CAAC,CACF;YACF,CAAC,CAAC,MAAM,CAAA;QAET,IAAI,IAAI,EAAE,GAAG,EAAE,CAAC;YACf,SAAS,GAAG,iBAAiB,CAC5B,SAAS,EACT,sBAAsB,CAAC,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACrD,SAAS,EAAE,IAAI,CAAC,YAAY,IAAI,GAAG;aACnC,CAAC,CACF,CAAA;QACF,CAAC;QAED,IAAI,IAAI,EAAE,mBAAmB,EAAE,CAAC;YAC/B,SAAS,GAAG,iBAAiB,CAC5B,SAAS,EACT,mCAAmC,CAClC,IAAI,CAAC,mBAAmB,EACxB,MAAM,EACN,IAAI,CAAC,MAAM,EACX,IAAI,CAAC,uBAAuB,IAAI,EAAE,CAClC,CACD,CAAA;QACF,CAAC;QAED,MAAM,YAAY,GACjB,IAAI,CAAC,UAAU,KAAK,SAAS;YAC5B,CAAC,CAAC,OAAO,CAAC;gBACR,SAAS;gBACT,WAAW,EAAE,IAAI,CAAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;gBACvC,cAAc,EAAE,IAAI,CAAC,cAAc;aACnC,CAAC,CAAC,IAAI;YACR,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAA;QAElD,MAAM,MAAM,GAAmB,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAClD,MAAM,GAAG,GAAG,YAAY,CAAC,CAAC,CAAE,CAAA;YAC5B,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,CAAA;YACjC,OAAO;gBACN,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,GAAG,EAAE,CAAC,CAAC,GAAG;gBACV,KAAK,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,GAAG,CAA0B;gBACzD,UAAU,EAAE,KAAK,CAAC,GAAG,CAAE;aACvB,CAAA;QACF,CAAC,CAAC,CAAA;QAEF,OAAO;YACN,IAAI,EAAE,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;YACjG,MAAM;YACN,MAAM,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC;SAC3D,CAAA;IACF,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,IAAY,EAAE,IAAgB;QAC7C,OAAO,YAAY,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;IAClD,CAAC;IAED,KAAK,CAAC,WAAW,CAAC,IAAY,EAAE,IAAgB;QAC/C,OAAO,cAAc,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;IACpD,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,IAAY,EAAE,IAA8D;QAC1F,OAAO,WAAW,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,EAAE,I
AAI,EAAE,GAAG,CAAC,CAAA;IAC5D,CAAC;IAED;;;;;;;;;;OAUG;IACK,mBAAmB,CAAC,MAA2B;QACtD,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;YAAE,OAAM;QAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAE,CAAC,MAAM,CAAA;QAC/B,IAAI,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YAChC,MAAM,IAAI,KAAK,CACd,wCAAwC,KAAK,2CAA2C;gBACvF,wBAAwB,IAAI,CAAC,MAAM,CAAC,MAAM,iDAAiD;gBAC3F,oFAAoF,CACrF,CAAA;QACF,CAAC;IACF,CAAC;CACD;AAmED,SAAS,aAAa,CAAC,GAAa;IACnC,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,IAAI,MAAM,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;IACpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,IAAI,GAAG,CAAC,CAAC,CAAE,GAAG,MAAM,EAAE,CAAC;YACtB,MAAM,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;YAChB,MAAM,GAAG,CAAC,CAAA;QACX,CAAC;IACF,CAAC;IACD,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,KAAK,MAAM,CAAC,IAAI,GAAG;QAAE,MAAM,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,CAAA;IACnD,MAAM,IAAI,GAAG,CAAC,GAAG,MAAM,CAAA;IACvB,OAAO,EAAE,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,CAAA;AAC7B,CAAC;AAED,uGAAuG;AACvG,SAAS,WAAW,CAAC,CAAa,EAAE,CAAa;IAChD,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAA;IAClB,MAAM,GAAG,GAAe,EAAE,CAAA;IAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5B,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,CAAA;QAChC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAA;QAC1D,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACd,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC"}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Inference-side gazetteer-anchor features (#464, knowledge-ladder rung 3.2) — the TS mirror of the
|
|
7
|
+
* Python training pipeline (`mailwoman_train/gazetteer_anchor.py`). Both consumers load the SAME
|
|
8
|
+
* codex-generated lexicon (`scripts/build-gazetteer-anchor-lexicon.mjs` →
|
|
9
|
+
* `data/gazetteer/anchor-lexicon-v1.json`) whose `rules` encode the match semantics as DATA, so the
|
|
10
|
+
* two implementations cannot drift. The model conditions on per-token candidate-tag-set clues fed
|
|
11
|
+
* alongside `input_ids`; this builds them from a raw address + its SentencePiece pieces.
|
|
12
|
+
*
|
|
13
|
+
* The clue INFORMS, the model decides (model-first). `gazetteer-inference.test.ts` pins the matcher
|
|
14
|
+
* against the Python fixture: the homograph clue is symmetric, "in" ≠ "IN", multi-word countries
|
|
15
|
+
* paint every word.
|
|
16
|
+
*/
|
|
17
|
+
import type { TokenizedPiece } from "./tokenizer.js";
|
|
18
|
+
/**
|
|
19
|
+
* The candidate-tag-set feature width: country/region/po_box/cedex/homograph (the lexicon's slot
|
|
20
|
+
* count). Used for the ONNX zero-fallback when a gazetteer-trained model is run with no clue data.
|
|
21
|
+
* MUST match the lexicon JSON's `feature_dim` and the trained model's `gazetteer_feature_dim`.
|
|
22
|
+
*/
|
|
23
|
+
export declare const GAZETTEER_FEATURE_DIM = 5;
|
|
24
|
+
/** The loaded lexicon — the parsed, in-memory form (camelCase keys, `Map` lookups) of the JSON from build-gazetteer-anchor-lexicon.mjs. */
|
|
25
|
+
export interface GazetteerLexicon {
|
|
26
|
+
/** Width of the all-zero feature row emitted for a piece with no clue (the JSON's `feature_dim`). */
featureDim: number;
|
|
27
|
+
/** Slot names in feature-column order: column `k` of a feature row reports on `slots[k]`. */
slots: readonly string[];
|
|
28
|
+
/** Slot name → bit flag; a row column is 1 when a candidate-tag bitmask shares a bit with it. */
bits: Record<string, number>;
|
|
29
|
+
/** Longest run of consecutive words the matcher tries as one phrase (the JSON's `max_ngram`). */
maxNgram: number;
|
|
30
|
+
/** case-insensitive: key = word_norm lowercased → bitmask. Looked up with the space-joined, lowercased phrase. */
|
|
31
|
+
entries: Map<string, number>;
|
|
32
|
+
/** case-SENSITIVE: key = word_norm uppercased → bitmask (surface must already be uppercase). Consulted for single words only. */
|
|
33
|
+
codeEntries: Map<string, number>;
|
|
34
|
+
}
|
|
35
|
+
/** Parse the lexicon JSON (already `JSON.parse`d — keeps this module browser-safe; caller reads). */
|
|
36
|
+
export declare function parseGazetteerLexicon(raw: {
|
|
37
|
+
/** Returned as `featureDim`. */
feature_dim: number;
|
|
38
|
+
/** Returned as `slots` (same array reference, not copied). */
slots: string[];
|
|
39
|
+
/** Returned as `bits` (same object reference, not copied). */
bits: Record<string, number>;
|
|
40
|
+
/** Returned as `maxNgram`. */
max_ngram: number;
|
|
41
|
+
/** Converted to the `entries` Map via `Object.entries`. */
entries: Record<string, number>;
|
|
42
|
+
/** Converted to the `codeEntries` Map via `Object.entries`. */
code_entries: Record<string, number>;
|
|
43
|
+
}): GazetteerLexicon;
|
|
44
|
+
/** Scan the raw surface and paint each char with its candidate-tag bitmask (mirrors Python). */
|
|
45
|
+
// Returns one bitmask per index of `text` (array length === `text.length`); 0 where no lexicon entry matched.
export declare function gazetteerCharPaint(text: string, lexicon: GazetteerLexicon): number[];
|
|
46
|
+
/**
|
|
47
|
+
* Channel choreography (#464, v0.9.13 postcode fix; DeepSeek 2026-06-10): zero the gazetteer clue on
|
|
48
|
+
* pieces within `window` of a postcode-anchor hit. The clue fires on the region token (`CA`/`GA`)
|
|
49
|
+
* immediately before a US postcode; its additive vector strengthens `B-region`, which makes the
|
|
50
|
+
* `B-region → B-postcode` CRF transition less competitive and drops the postcode (~3pp, US-only — FR
|
|
51
|
+
* postcode precedes the locality, no region neighbor). Suppressing the clue adjacent to the postcode
|
|
52
|
+
* removes the interference while leaving every other clue intact. Returns a NEW features/confidence
|
|
53
|
+
* pair (does not mutate). `anchorConfidence[i] > 0` marks postcode-span pieces. PAIRS WITH the
|
|
54
|
+
* train-time half (`gazetteer_anchor.suppress_gazetteer_near_postcode`) — enable both or neither.
|
|
55
|
+
*/
|
|
56
|
+
export declare function suppressGazetteerNearPostcode(gazetteer: {
|
|
57
|
+
/** Per-piece feature rows going in. */
features: number[][];
|
|
58
|
+
/** Per-piece clue confidence going in; its length sets how many pieces are considered. */
confidence: number[];
|
|
59
|
+
// `anchorConfidence[i] > 0` marks a postcode hit (missing entries count as 0); `window` defaults to 1.
}, anchorConfidence: ReadonlyArray<number>, window?: number): {
|
|
60
|
+
/** Feature rows coming out; a suppressed piece's row is all zeros. */
features: number[][];
|
|
61
|
+
/** Confidence coming out; a suppressed piece's entry is 0. */
confidence: number[];
|
|
62
|
+
};
|
|
63
|
+
/**
|
|
64
|
+
* Per-piece gazetteer features + confidence for `text`, projected onto its SP `pieces` by the SAME
|
|
65
|
+
* char→piece rule the labels use (a piece takes the bits of the first non-whitespace char it covers).
|
|
66
|
+
* Returns `(pieces × featureDim)` features + `(pieces,)` confidence (1.0 wherever any bit fires).
|
|
67
|
+
*/
|
|
68
|
+
export declare function buildGazetteerFeatures(text: string, pieces: ReadonlyArray<TokenizedPiece>, lexicon: GazetteerLexicon): {
|
|
69
|
+
/** One row per piece: a 0/1 column per lexicon slot when a clue fired, otherwise `featureDim` zeros. */
features: number[][];
|
|
70
|
+
/** One value per piece: 1 when any bit fired for the piece, otherwise 0. */
confidence: number[];
|
|
71
|
+
};
|
|
72
|
+
//# sourceMappingURL=gazetteer-inference.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"gazetteer-inference.d.ts","sourceRoot":"","sources":["../gazetteer-inference.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAEpD;;;;GAIG;AACH,eAAO,MAAM,qBAAqB,IAAI,CAAA;AAEtC,mFAAmF;AACnF,MAAM,WAAW,gBAAgB;IAChC,UAAU,EAAE,MAAM,CAAA;IAClB,KAAK,EAAE,SAAS,MAAM,EAAE,CAAA;IACxB,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IAC5B,QAAQ,EAAE,MAAM,CAAA;IAChB,8DAA8D;IAC9D,OAAO,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IAC5B,gGAAgG;IAChG,WAAW,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;CAChC;AAED,qGAAqG;AACrG,wBAAgB,qBAAqB,CAAC,GAAG,EAAE;IAC1C,WAAW,EAAE,MAAM,CAAA;IACnB,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IAC5B,SAAS,EAAE,MAAM,CAAA;IACjB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IAC/B,YAAY,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;CACpC,GAAG,gBAAgB,CASnB;AAsBD,gGAAgG;AAChG,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,MAAM,EAAE,CA0DpF;AAED;;;;;;;;;GASG;AACH,wBAAgB,6BAA6B,CAC5C,SAAS,EAAE;IAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,CAAC;IAAC,UAAU,EAAE,MAAM,EAAE,CAAA;CAAE,EACzD,gBAAgB,EAAE,aAAa,CAAC,MAAM,CAAC,EACvC,MAAM,SAAI,GACR;IAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,CAAC;IAAC,UAAU,EAAE,MAAM,EAAE,CAAA;CAAE,CAgBhD;AAED;;;;GAIG;AACH,wBAAgB,sBAAsB,CACrC,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,aAAa,CAAC,cAAc,CAAC,EACrC,OAAO,EAAE,gBAAgB,GACvB;IAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,CAAC;IAAC,UAAU,EAAE,MAAM,EAAE,CAAA;CAAE,CAiBhD"}
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Inference-side gazetteer-anchor features (#464, knowledge-ladder rung 3.2) — the TS mirror of the
|
|
7
|
+
* Python training pipeline (`mailwoman_train/gazetteer_anchor.py`). Both consumers load the SAME
|
|
8
|
+
* codex-generated lexicon (`scripts/build-gazetteer-anchor-lexicon.mjs` →
|
|
9
|
+
* `data/gazetteer/anchor-lexicon-v1.json`) whose `rules` encode the match semantics as DATA, so the
|
|
10
|
+
* two implementations cannot drift. The model conditions on per-token candidate-tag-set clues fed
|
|
11
|
+
* alongside `input_ids`; this builds them from a raw address + its SentencePiece pieces.
|
|
12
|
+
*
|
|
13
|
+
* The clue INFORMS, the model decides (model-first). `gazetteer-inference.test.ts` pins the matcher
|
|
14
|
+
* against the Python fixture: the homograph clue is symmetric, "in" ≠ "IN", multi-word countries
|
|
15
|
+
* paint every word.
|
|
16
|
+
*/
|
|
17
|
+
/**
|
|
18
|
+
* The candidate-tag-set feature width: country/region/po_box/cedex/homograph (the lexicon's slot
|
|
19
|
+
* count). Used for the ONNX zero-fallback when a gazetteer-trained model is run with no clue data.
|
|
20
|
+
* MUST match the lexicon JSON's `feature_dim` and the trained model's `gazetteer_feature_dim`.
|
|
21
|
+
*/
|
|
22
|
+
export const GAZETTEER_FEATURE_DIM = 5;
|
|
23
|
+
/**
 * Turn an already-`JSON.parse`d lexicon object into the in-memory lexicon. Nothing is read from
 * disk or network here (the caller supplies the parsed JSON), which keeps the module browser-safe.
 *
 * The snake_case keys are renamed to camelCase, `slots` and `bits` are handed through by
 * reference, and the `entries` / `code_entries` objects become `Map`s keyed by the same strings.
 */
export function parseGazetteerLexicon(raw) {
    const { feature_dim: featureDim, slots, bits, max_ngram: maxNgram } = raw;
    // Plain object → Map, preserving the object's own enumerable string keys.
    const toLookup = (table) => new Map(Object.entries(table));
    const entries = toLookup(raw.entries);
    const codeEntries = toLookup(raw.code_entries);
    return { featureDim, slots, bits, maxNgram, entries, codeEntries };
}
|
|
34
|
+
/**
 * word_norm for a single whitespace-free word: drop every leading and trailing character that is
 * not a Unicode letter or number, leaving anything in the middle (apostrophes, hyphens, dots)
 * untouched. Returns "" when the word contains no letter or number at all.
 *
 * The check is applied to one string index (UTF-16 code unit) at a time.
 */
function stripWord(word) {
    const isLetterOrDigit = (unit) => /[\p{L}\p{N}]/u.test(unit);
    // Walk in from the left to the first letter/digit...
    let from = 0;
    for (; from < word.length; from++) {
        if (isLetterOrDigit(word[from]))
            break;
    }
    // ...then in from the right, never crossing the left boundary.
    let to = word.length;
    for (; to > from; to--) {
        if (isLetterOrDigit(word[to - 1]))
            break;
    }
    return word.slice(from, to);
}
|
|
45
|
+
/**
 * Expand a candidate-tag bitmask into a 0/1 row with one column per lexicon slot, in `slots`
 * order. A column is 1 when the mask shares at least one bit with that slot's flag in
 * `lexicon.bits`; a slot with no flag there yields 0.
 */
function bitsToRow(bits, lexicon) {
    const row = [];
    for (const slot of lexicon.slots) {
        const overlap = bits & lexicon.bits[slot];
        row.push(overlap !== 0 ? 1 : 0);
    }
    return row;
}
|
|
48
|
+
/**
 * Scan the raw surface and paint each char with its candidate-tag bitmask (mirrors Python).
 *
 * Pass 1 splits `text` on whitespace and trims every word to its letter/digit core, remembering
 * where that core sits in `text`. A word made only of punctuation is kept as an empty placeholder
 * so it breaks any phrase that would span it.
 *
 * Pass 2 walks the words left to right and, at each one, tries the longest phrase first
 * (`lexicon.maxNgram` words down to 1). A phrase is looked up space-joined and lowercased in
 * `lexicon.entries`; a single word is additionally looked up with its original casing in
 * `lexicon.codeEntries`, and the two masks are OR-ed. The first non-zero mask wins, is painted from
 * the first word's core start to the last word's core end (whitespace and punctuation in between
 * included), and the scan resumes after the matched words, so matches never overlap.
 *
 * @param text Raw address surface.
 * @param lexicon Parsed lexicon (`maxNgram`, `entries`, `codeEntries` are read).
 * @returns An array of length `text.length`, indexed like `text`; 0 where nothing matched.
 */
export function gazetteerCharPaint(text, lexicon) {
    // One matcher for the whole call instead of a fresh closure + regex per word. It is applied to
    // a single string index (UTF-16 code unit) at a time.
    // NOTE(review): a character outside the BMP is seen as two surrogate units, neither of which
    // matches, so it is trimmed like punctuation — confirm this agrees with the Python side.
    const wordChar = /[\p{L}\p{N}]/u;
    const charBits = new Array(text.length).fill(0);
    const words = [];
    for (const match of text.matchAll(/\S+/g)) {
        const surface = match[0];
        // Trim once from each side; `head`/`tail` bound the letter/digit core inside `surface`.
        let head = 0;
        while (head < surface.length && !wordChar.test(surface[head]))
            head++;
        let tail = surface.length;
        while (tail > head && !wordChar.test(surface[tail - 1]))
            tail--;
        if (head === tail) {
            // Punctuation-only word: empty placeholder that blocks phrases across it.
            words.push({ begin: match.index, end: match.index, text: "" });
            continue;
        }
        words.push({ begin: match.index + head, end: match.index + tail, text: surface.slice(head, tail) });
    }
    let i = 0;
    while (i < words.length) {
        if (!words[i].text) {
            i++;
            continue;
        }
        const maxN = Math.min(lexicon.maxNgram, words.length - i);
        let matchedN = 0;
        let matchedBits = 0;
        // Longest phrase first; stop at the first length that yields any bit.
        for (let n = maxN; n >= 1 && !matchedN; n--) {
            const span = words.slice(i, i + n);
            if (span.some((w) => !w.text))
                continue;
            const parts = span.map((w) => w.text);
            let bits = lexicon.entries.get(parts.join(" ").toLowerCase()) ?? 0;
            // code_entries is case-SENSITIVE: the surface must already BE uppercase ("IN" ≠ "in").
            if (n === 1)
                bits |= lexicon.codeEntries.get(parts[0]) ?? 0;
            if (bits) {
                matchedN = n;
                matchedBits = bits;
            }
        }
        if (!matchedN) {
            i++;
            continue;
        }
        const begin = words[i].begin;
        const end = Math.min(words[i + matchedN - 1].end, text.length);
        charBits.fill(matchedBits, begin, end);
        i += matchedN;
    }
    return charBits;
}
|
|
112
|
+
/**
 * Channel choreography (#464, v0.9.13 postcode fix; DeepSeek 2026-06-10): zero the gazetteer clue on
 * pieces within `window` of a postcode-anchor hit. The clue fires on the region token (`CA`/`GA`)
 * immediately before a US postcode; its additive vector strengthens `B-region`, which makes the
 * `B-region → B-postcode` CRF transition less competitive and drops the postcode (~3pp, US-only — FR
 * postcode precedes the locality, no region neighbor). Suppressing the clue adjacent to the postcode
 * removes the interference while leaving every other clue intact. PAIRS WITH the train-time half
 * (`gazetteer_anchor.suppress_gazetteer_near_postcode`) — enable both or neither.
 *
 * A hit marks its neighbours at distance 1..`window` on both sides, not itself; a hit piece is
 * therefore only zeroed when another hit lies within the window of it.
 *
 * @param gazetteer Per-piece `features` rows and `confidence`; `confidence.length` sets the piece count.
 * @param anchorConfidence `anchorConfidence[i] > 0` marks postcode-span pieces; missing entries count as 0.
 * @param window How many pieces on each side of a hit are suppressed (default 1).
 * @returns A NEW features/confidence pair. The input is not mutated, and every returned row is a
 *   fresh array, so editing the result in place cannot reach back into `gazetteer`.
 */
export function suppressGazetteerNearPostcode(gazetteer, anchorConfidence, window = 1) {
    const n = gazetteer.confidence.length;
    const suppress = new Array(n).fill(false);
    for (let i = 0; i < n; i++) {
        if (!((anchorConfidence[i] ?? 0) > 0))
            continue;
        for (let d = -window; d <= window; d++) {
            const j = i + d;
            if (d !== 0 && j >= 0 && j < n)
                suppress[j] = true;
        }
    }
    // Zero rows take their width from the first input row.
    const dim = gazetteer.features[0]?.length ?? 0;
    return {
        // Kept rows are copied rather than shared, so the result is fully detached from the input.
        features: gazetteer.features.map((row, i) => (suppress[i] ? new Array(dim).fill(0) : row.slice())),
        confidence: gazetteer.confidence.map((c, i) => (suppress[i] ? 0 : c)),
    };
}
|
|
140
|
+
/**
 * Per-piece gazetteer features + confidence for `text`, projected onto its SP `pieces` by the SAME
 * char→piece rule the labels use: a piece takes the bits painted on the first non-whitespace char
 * inside its `[start, end)` span (positions at or beyond `text.length` are skipped).
 *
 * @param text Raw address surface; painted via `gazetteerCharPaint`.
 * @param pieces Tokenized pieces; only `start` and `end` are read.
 * @param lexicon Parsed lexicon.
 * @returns `features`: one row per piece — the slot row from `bitsToRow` when any bit fired,
 *   otherwise `lexicon.featureDim` zeros. `confidence`: 1.0 for pieces with a bit, 0 otherwise.
 */
export function buildGazetteerFeatures(text, pieces, lexicon) {
    const painted = gazetteerCharPaint(text, lexicon);
    const features = [];
    const confidence = [];
    for (const piece of pieces) {
        // Find the first in-range, non-whitespace char the piece covers and take its mask.
        let mask = 0;
        let cursor = piece.start;
        while (cursor < piece.end) {
            const inText = cursor < text.length;
            if (inText && !/\s/.test(text[cursor])) {
                mask = painted[cursor];
                break;
            }
            cursor++;
        }
        if (mask) {
            features.push(bitsToRow(mask, lexicon));
            confidence.push(1.0);
        }
        else {
            features.push(new Array(lexicon.featureDim).fill(0));
            confidence.push(0);
        }
    }
    return { features, confidence };
}
|
|
163
|
+
//# sourceMappingURL=gazetteer-inference.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"gazetteer-inference.js","sourceRoot":"","sources":["../gazetteer-inference.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAIH;;;;GAIG;AACH,MAAM,CAAC,MAAM,qBAAqB,GAAG,CAAC,CAAA;AActC,qGAAqG;AACrG,MAAM,UAAU,qBAAqB,CAAC,GAOrC;IACA,OAAO;QACN,UAAU,EAAE,GAAG,CAAC,WAAW;QAC3B,KAAK,EAAE,GAAG,CAAC,KAAK;QAChB,IAAI,EAAE,GAAG,CAAC,IAAI;QACd,QAAQ,EAAE,GAAG,CAAC,SAAS;QACvB,OAAO,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QAC7C,WAAW,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;KACtD,CAAA;AACF,CAAC;AAED,6FAA6F;AAC7F,SAAS,SAAS,CAAC,IAAY;IAC9B,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAA;IACrB,MAAM,KAAK,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACpD,OAAO,KAAK,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAE,CAAC;QAAE,KAAK,EAAE,CAAA;IACnD,OAAO,GAAG,GAAG,KAAK,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,CAAE,CAAC;QAAE,GAAG,EAAE,CAAA;IACnD,OAAO,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAA;AAC9B,CAAC;AAED,SAAS,SAAS,CAAC,IAAY,EAAE,OAAyB;IACzD,OAAO,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;AACzE,CAAC;AAQD,gGAAgG;AAChG,MAAM,UAAU,kBAAkB,CAAC,IAAY,EAAE,OAAyB;IACzE,MAAM,QAAQ,GAAG,IAAI,KAAK,CAAS,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACvD,MAAM,MAAM,GAAG,MAAM,CAAA;IACrB,MAAM,KAAK,GAAe,EAAE,CAAA;IAC5B,IAAI,CAAyB,CAAA;IAC7B,OAAO,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACzC,MAAM,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,CAAA;QACpB,MAAM,QAAQ,GAAG,SAAS,CAAC,OAAO,CAAC,CAAA;QACnC,IAAI,CAAC,QAAQ,EAAE,CAAC;YACf,KAAK,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,CAAA;YACtD,SAAQ;QACT,CAAC;QACD,IAAI,IAAI,GAAG,CAAC,CAAA;QACZ,MAAM,KAAK,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACpD,OAAO,IAAI,GAAG,OAAO,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAE,CAAC;YAAE,IAAI,EAAE,CAAA;Q
AC9D,KAAK,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,GAAG,IAAI,EAAE,GAAG,EAAE,CAAC,CAAC,KAAK,GAAG,IAAI,GAAG,QAAQ,CAAC,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,CAAA;IAC7F,CAAC;IAED,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;QACzB,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,CAAC;YACrB,CAAC,EAAE,CAAA;YACH,SAAQ;QACT,CAAC;QACD,IAAI,QAAQ,GAAG,CAAC,CAAA;QAChB,IAAI,WAAW,GAAG,CAAC,CAAA;QACnB,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,QAAQ,EAAE,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;QACzD,KAAK,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChC,MAAM,KAAK,GAAa,EAAE,CAAA;YAC1B,IAAI,EAAE,GAAG,IAAI,CAAA;YACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;gBAChC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,CAAC;oBACrB,EAAE,GAAG,KAAK,CAAA;oBACV,MAAK;gBACN,CAAC;gBACD,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,CAAA;YAC3B,CAAC;YACD,IAAI,CAAC,EAAE;gBAAE,SAAQ;YACjB,MAAM,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,WAAW,EAAE,CAAA;YACzC,IAAI,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;YACxC,uFAAuF;YACvF,IAAI,CAAC,KAAK,CAAC;gBAAE,IAAI,IAAI,OAAO,CAAC,WAAW,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,CAAA;YAC5D,IAAI,IAAI,EAAE,CAAC;gBACV,QAAQ,GAAG,CAAC,CAAA;gBACZ,WAAW,GAAG,IAAI,CAAA;gBAClB,MAAK;YACN,CAAC;QACF,CAAC;QACD,IAAI,QAAQ,EAAE,CAAC;YACd,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC,KAAK,CAAA;YAC7B,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,GAAG,QAAQ,GAAG,CAAC,CAAE,CAAC,GAAG,CAAA;YACxC,KAAK,IAAI,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE;gBAAE,QAAQ,CAAC,CAAC,CAAC,GAAG,WAAW,CAAA;YAClF,CAAC,IAAI,QAAQ,CAAA;QACd,CAAC;aAAM,CAAC;YACP,CAAC,EAAE,CAAA;QACJ,CAAC;IACF,CAAC;IACD,OAAO,QAAQ,CAAA;AAChB,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,UAAU,6BAA6B,CAC5C,SAAyD,EACzD,gBAAuC,EACvC,MAAM,GAAG,CAAC;IAEV,MAAM,CAAC,GAAG,SAAS,CAAC,UAAU,CAAC,MAAM,CAAA;IACrC,MAAM,QAAQ,GAAG,IAAI,KAAK,CAAU,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IAClD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAA
G,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5B,IAAI,CAAC,gBAAgB,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC;YACpC,KAAK,IAAI,CAAC,GAAG,CAAC,MAAM,EAAE,CAAC,IAAI,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACxC,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;gBACf,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC;oBAAE,QAAQ,CAAC,CAAC,CAAC,GAAG,IAAI,CAAA;YACnD,CAAC;QACF,CAAC;IACF,CAAC;IACD,MAAM,GAAG,GAAG,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,MAAM,IAAI,CAAC,CAAA;IAC9C,OAAO;QACN,QAAQ,EAAE,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,KAAK,CAAS,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAClG,UAAU,EAAE,SAAS,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;KACrE,CAAA;AACF,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,sBAAsB,CACrC,IAAY,EACZ,MAAqC,EACrC,OAAyB;IAEzB,MAAM,QAAQ,GAAG,kBAAkB,CAAC,IAAI,EAAE,OAAO,CAAC,CAAA;IAClD,MAAM,IAAI,GAAG,GAAG,EAAE,CAAC,IAAI,KAAK,CAAS,OAAO,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IAChE,MAAM,QAAQ,GAAe,EAAE,CAAA;IAC/B,MAAM,UAAU,GAAa,EAAE,CAAA;IAC/B,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACxB,IAAI,IAAI,GAAG,CAAC,CAAA;QACZ,KAAK,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,IAAI,CAAC,GAAG,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,EAAE,CAAC;gBAC7C,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAE,CAAA;gBACnB,MAAK;YACN,CAAC;QACF,CAAC;QACD,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAA;QACvD,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IAChC,CAAC;IACD,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAA;AAChC,CAAC"}
|
package/out/index.d.ts
CHANGED
package/out/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,uBAAuB,CAAA;AACrC,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,sBAAsB,CAAA;AACpC,cAAc,+BAA+B,CAAA;AAC7C,cAAc,0BAA0B,CAAA;AACxC,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAA;AAC/E,YAAY,EAAE,eAAe,EAAE,kBAAkB,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAA;AAC5G,cAAc,gBAAgB,CAAA;AAC9B,OAAO,EACN,eAAe,EACf,iBAAiB,EACjB,sBAAsB,EACtB,cAAc,EACd,OAAO,EACP,OAAO,GACP,MAAM,cAAc,CAAA;AACrB,YAAY,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,cAAc,CAAA;AAC/D,cAAc,cAAc,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,uBAAuB,CAAA;AACrC,cAAc,iBAAiB,CAAA;AAC/B,cAAc,0BAA0B,CAAA;AACxC,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,sBAAsB,CAAA;AACpC,cAAc,+BAA+B,CAAA;AAC7C,cAAc,0BAA0B,CAAA;AACxC,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAA;AAC/E,YAAY,EAAE,eAAe,EAAE,kBAAkB,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAA;AAC5G,cAAc,gBAAgB,CAAA;AAC9B,OAAO,EACN,eAAe,EACf,iBAAiB,EACjB,sBAAsB,EACtB,cAAc,EACd,OAAO,EACP,OAAO,GACP,MAAM,cAAc,CAAA;AACrB,YAAY,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,cAAc,CAAA;AAC/D,cAAc,cAAc,CAAA"}
|
package/out/index.js
CHANGED
package/out/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,uBAAuB,CAAA;AACrC,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,sBAAsB,CAAA;AACpC,cAAc,+BAA+B,CAAA;AAC7C,cAAc,0BAA0B,CAAA;AACxC,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAA;AAE/E,cAAc,gBAAgB,CAAA;AAC9B,OAAO,EACN,eAAe,EACf,iBAAiB,EACjB,sBAAsB,EACtB,cAAc,EACd,OAAO,EACP,OAAO,GACP,MAAM,cAAc,CAAA;AAErB,cAAc,cAAc,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,uBAAuB,CAAA;AACrC,cAAc,iBAAiB,CAAA;AAC/B,cAAc,0BAA0B,CAAA;AACxC,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,sBAAsB,CAAA;AACpC,cAAc,+BAA+B,CAAA;AAC7C,cAAc,0BAA0B,CAAA;AACxC,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAA;AAE/E,cAAc,gBAAgB,CAAA;AAC9B,OAAO,EACN,eAAe,EACf,iBAAiB,EACjB,sBAAsB,EACtB,cAAc,EACd,OAAO,EACP,OAAO,GACP,MAAM,cAAc,CAAA;AAErB,cAAc,cAAc,CAAA"}
|
package/out/onnx-runner.d.ts
CHANGED
|
@@ -58,6 +58,9 @@ export declare class OnnxRunner {
|
|
|
58
58
|
infer(tokenIds: number[], anchor?: {
|
|
59
59
|
features: ReadonlyArray<ReadonlyArray<number>>;
|
|
60
60
|
confidence: ReadonlyArray<number>;
|
|
61
|
+
}, gazetteer?: {
|
|
62
|
+
features: ReadonlyArray<ReadonlyArray<number>>;
|
|
63
|
+
confidence: ReadonlyArray<number>;
|
|
61
64
|
}): Promise<InferResult>;
|
|
62
65
|
}
|
|
63
66
|
//# sourceMappingURL=onnx-runner.d.ts.map
|
package/out/onnx-runner.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"onnx-runner.d.ts","sourceRoot":"","sources":["../onnx-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;
|
|
1
|
+
{"version":3,"file":"onnx-runner.d.ts","sourceRoot":"","sources":["../onnx-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAQH,MAAM,WAAW,cAAc;IAC9B,wEAAwE;IACxE,MAAM,CAAC,EAAE,OAAO,CAAA;IAChB;;;;;OAKG;IACH,WAAW,CAAC,EAAE,MAAM,CAAA;CACpB;AAED,8FAA8F;AAC9F,eAAO,MAAM,qBAAqB,MAAM,CAAA;AAExC,MAAM,WAAW,WAAW;IAC3B,2EAA2E;IAC3E,MAAM,EAAE,MAAM,EAAE,EAAE,CAAA;IAClB,oEAAoE;IACpE,SAAS,EAAE,MAAM,CAAA;CACjB;AAED,qBAAa,UAAU;IAMrB,OAAO,CAAC,QAAQ,CAAC,SAAS;IAC1B,OAAO,CAAC,QAAQ,CAAC,UAAU;IAN5B,OAAO,CAAC,OAAO,CAAoC;IACnD,OAAO,CAAC,WAAW,CAA6C;IAChE,SAAgB,WAAW,EAAE,MAAM,CAAA;IAEnC,OAAO;IAQP,oEAAoE;WACvD,MAAM,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,GAAE,cAAmB,GAAG,OAAO,CAAC,UAAU,CAAC;IAMtF,6CAA6C;WAChC,SAAS,CAAC,UAAU,EAAE,UAAU,EAAE,IAAI,GAAE,cAAmB,GAAG,OAAO,CAAC,UAAU,CAAC;YAMhF,aAAa;IAgB3B;;;;;;;;;;;OAWG;IACG,KAAK,CACV,QAAQ,EAAE,MAAM,EAAE,EAClB,MAAM,CAAC,EAAE;QAAE,QAAQ,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC;QAAC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,CAAA;KAAE,EAC9F,SAAS,CAAC,EAAE;QAAE,QAAQ,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC;QAAC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,CAAA;KAAE,GAC/F,OAAO,CAAC,WAAW,CAAC;CA+EvB"}
|
package/out/onnx-runner.js
CHANGED
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
import { promises as fs } from "node:fs";
|
|
16
16
|
import ort from "onnxruntime-node";
|
|
17
17
|
import { ANCHOR_FEATURE_DIM } from "./anchor-inference.js";
|
|
18
|
+
import { GAZETTEER_FEATURE_DIM } from "./gazetteer-inference.js";
|
|
18
19
|
/** Default sequence length for v0.1.0 / v0.2.0 (BertConfig max_position_embeddings = 128). */
|
|
19
20
|
export const DEFAULT_FIXED_SEQ_LEN = 128;
|
|
20
21
|
export class OnnxRunner {
|
|
@@ -70,7 +71,7 @@ export class OnnxRunner {
|
|
|
70
71
|
* `(seqLen × dim)` + confidence `(seqLen,)` are fed, zero-padded to `fixedSeqLen`. Omit for
|
|
71
72
|
* plain models, whose ONNX has no anchor inputs.
|
|
72
73
|
*/
|
|
73
|
-
async infer(tokenIds, anchor) {
|
|
74
|
+
async infer(tokenIds, anchor, gazetteer) {
|
|
74
75
|
const session = await this.ensureSession();
|
|
75
76
|
const seqLen = Math.min(tokenIds.length, this.fixedSeqLen);
|
|
76
77
|
const padded = new BigInt64Array(this.fixedSeqLen);
|
|
@@ -108,6 +109,30 @@ export class OnnxRunner {
|
|
|
108
109
|
]);
|
|
109
110
|
feeds.anchor_confidence = new ort.Tensor("float32", new Float32Array(this.fixedSeqLen), [1, this.fixedSeqLen]);
|
|
110
111
|
}
|
|
112
|
+
// Gazetteer-anchor channel (#464): same feed contract as the postcode anchor. Feature width is
|
|
113
|
+
// read from the supplied rows (the lexicon's slot count); a gazetteer-trained model with no clue
|
|
114
|
+
// data supplied gets the confidence=0 identity (the model's gazetteer-off behavior).
|
|
115
|
+
if (gazetteer && session.inputNames.includes("gazetteer_features")) {
|
|
116
|
+
const dim = gazetteer.features[0]?.length ?? 0;
|
|
117
|
+
const gf = new Float32Array(this.fixedSeqLen * dim);
|
|
118
|
+
const gc = new Float32Array(this.fixedSeqLen);
|
|
119
|
+
for (let i = 0; i < seqLen; i++) {
|
|
120
|
+
gc[i] = gazetteer.confidence[i] ?? 0;
|
|
121
|
+
const row = gazetteer.features[i];
|
|
122
|
+
if (row)
|
|
123
|
+
for (let d = 0; d < dim; d++)
|
|
124
|
+
gf[i * dim + d] = row[d] ?? 0;
|
|
125
|
+
}
|
|
126
|
+
feeds.gazetteer_features = new ort.Tensor("float32", gf, [1, this.fixedSeqLen, dim]);
|
|
127
|
+
feeds.gazetteer_confidence = new ort.Tensor("float32", gc, [1, this.fixedSeqLen]);
|
|
128
|
+
}
|
|
129
|
+
else if (session.inputNames.includes("gazetteer_features")) {
|
|
130
|
+
feeds.gazetteer_features = new ort.Tensor("float32", new Float32Array(this.fixedSeqLen * GAZETTEER_FEATURE_DIM), [1, this.fixedSeqLen, GAZETTEER_FEATURE_DIM]);
|
|
131
|
+
feeds.gazetteer_confidence = new ort.Tensor("float32", new Float32Array(this.fixedSeqLen), [
|
|
132
|
+
1,
|
|
133
|
+
this.fixedSeqLen,
|
|
134
|
+
]);
|
|
135
|
+
}
|
|
111
136
|
const output = await session.run(feeds);
|
|
112
137
|
const logitsTensor = output.logits;
|
|
113
138
|
if (!logitsTensor)
|
package/out/onnx-runner.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"onnx-runner.js","sourceRoot":"","sources":["../onnx-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,SAAS,CAAA;AACxC,OAAO,GAAG,MAAM,kBAAkB,CAAA;AAElC,OAAO,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAA;
|
|
1
|
+
{"version":3,"file":"onnx-runner.js","sourceRoot":"","sources":["../onnx-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,SAAS,CAAA;AACxC,OAAO,GAAG,MAAM,kBAAkB,CAAA;AAElC,OAAO,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAA;AAC1D,OAAO,EAAE,qBAAqB,EAAE,MAAM,0BAA0B,CAAA;AAchE,8FAA8F;AAC9F,MAAM,CAAC,MAAM,qBAAqB,GAAG,GAAG,CAAA;AASxC,MAAM,OAAO,UAAU;IAMJ;IACA;IANV,OAAO,GAAgC,IAAI,CAAA;IAC3C,WAAW,GAAyC,IAAI,CAAA;IAChD,WAAW,CAAQ;IAEnC,YACkB,SAAiB,EACjB,UAA6B,EAC9C,IAAoB;QAFH,cAAS,GAAT,SAAS,CAAQ;QACjB,eAAU,GAAV,UAAU,CAAmB;QAG9C,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,qBAAqB,CAAA;IAC7D,CAAC;IAED,oEAAoE;IACpE,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,SAAiB,EAAE,OAAuB,EAAE;QAC/D,MAAM,MAAM,GAAG,IAAI,UAAU,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,CAAC,CAAA;QACpD,IAAI,IAAI,CAAC,MAAM;YAAE,MAAM,MAAM,CAAC,aAAa,EAAE,CAAA;QAC7C,OAAO,MAAM,CAAA;IACd,CAAC;IAED,6CAA6C;IAC7C,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,UAAsB,EAAE,OAAuB,EAAE;QACvE,MAAM,MAAM,GAAG,IAAI,UAAU,CAAC,SAAS,EAAE,UAAU,EAAE,IAAI,CAAC,CAAA;QAC1D,IAAI,IAAI,CAAC,MAAM;YAAE,MAAM,MAAM,CAAC,aAAa,EAAE,CAAA;QAC7C,OAAO,MAAM,CAAA;IACd,CAAC;IAEO,KAAK,CAAC,aAAa;QAC1B,IAAI,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC,OAAO,CAAA;QACrC,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC;YACvB,IAAI,CAAC,WAAW,GAAG,CAAC,KAAK,IAAI,EAAE;gBAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,UAAU,IAAI,IAAI,UAAU,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAA;gBAClF,MAAM,OAAO,GAAG,MAAM,GAAG,CAAC,gBAAgB,CAAC,MAAM,CAAC,KAAK,EAAE;oBACxD,kBAAkB,EAAE,CAAC,KAAK,CAAC;oBAC3B,sBAAsB,EAAE,KAAK;iBAC7B,CAAC,CAAA;gBACF,IAAI,CAAC,OAAO,GAAG,OAAO,CAAA;gBACtB,OAAO,OAAO,CAAA;YACf,CAAC,CAAC,EAAE,CAAA;QACL,CAAC;QACD,OAAO,IAAI,CAAC,WAAW,CAAA;IACxB,CAAC;IAED;;;;;;;;;;;OAWG;IACH,KAAK,CAAC,KAAK,CACV,QAAkB,EAClB,MAA8F,EAC9F,SAAiG;QAEjG,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,aAAa,EAAE,CAAA;QAC1C,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,EAAE,IAAI,CAAC,WAAW,CAAC,CAAA;QAC1D,MAAM,MAAM,GAAG,IAAI,aAAa,CAAC,IAAI,CAAC,WAAW,CAAC,CAAA;QAClD,MAAM,IAAI,GAAG,IAAI,aAAa,CAAC,IAAI,CAAC,WAAW,CAAC,CAAA;QAChD,KAAK,IAAI,CAAC,GAAG
,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjC,MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAE,CAAC,CAAA;YAChC,IAAI,CAAC,CAAC,CAAC,GAAG,EAAE,CAAA;QACb,CAAC;QAED,MAAM,KAAK,GAA+B;YACzC,SAAS,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;YACjE,cAAc,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;SACpE,CAAA;QAED,IAAI,MAAM,EAAE,CAAC;YACZ,MAAM,GAAG,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,MAAM,IAAI,CAAC,CAAA;YAC3C,MAAM,EAAE,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,GAAG,GAAG,CAAC,CAAA;YACnD,MAAM,EAAE,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,CAAC,CAAA;YAC7C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACjC,EAAE,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;gBACjC,MAAM,GAAG,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAA;gBAC9B,IAAI,GAAG;oBAAE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE;wBAAE,EAAE,CAAC,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;YACrE,CAAC;YACD,KAAK,CAAC,eAAe,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,CAAA;YACjF,KAAK,CAAC,iBAAiB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAA;QAC/E,CAAC;aAAM,IAAI,OAAO,CAAC,UAAU,CAAC,QAAQ,CAAC,iBAAiB,CAAC,EAAE,CAAC;YAC3D,6FAA6F;YAC7F,0FAA0F;YAC1F,0EAA0E;YAC1E,KAAK,CAAC,eAAe,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,GAAG,kBAAkB,CAAC,EAAE;gBAC1G,CAAC;gBACD,IAAI,CAAC,WAAW;gBAChB,kBAAkB;aAClB,CAAC,CAAA;YACF,KAAK,CAAC,iBAAiB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAA;QAC/G,CAAC;QAED,+FAA+F;QAC/F,iGAAiG;QACjG,qFAAqF;QACrF,IAAI,SAAS,IAAI,OAAO,CAAC,UAAU,CAAC,QAAQ,CAAC,oBAAoB,CAAC,EAAE,CAAC;YACpE,MAAM,GAAG,GAAG,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,MAAM,IAAI,CAAC,CAAA;YAC9C,MAAM,EAAE,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,GAAG,GAAG,CAAC,CAAA;YACnD,MAAM,EAAE,GAAG,IAA
I,YAAY,CAAC,IAAI,CAAC,WAAW,CAAC,CAAA;YAC7C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACjC,EAAE,CAAC,CAAC,CAAC,GAAG,SAAS,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;gBACpC,MAAM,GAAG,GAAG,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAA;gBACjC,IAAI,GAAG;oBAAE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE;wBAAE,EAAE,CAAC,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;YACrE,CAAC;YACD,KAAK,CAAC,kBAAkB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,CAAA;YACpF,KAAK,CAAC,oBAAoB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAA;QAClF,CAAC;aAAM,IAAI,OAAO,CAAC,UAAU,CAAC,QAAQ,CAAC,oBAAoB,CAAC,EAAE,CAAC;YAC9D,KAAK,CAAC,kBAAkB,GAAG,IAAI,GAAG,CAAC,MAAM,CACxC,SAAS,EACT,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,GAAG,qBAAqB,CAAC,EAC1D,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,EAAE,qBAAqB,CAAC,CAC5C,CAAA;YACD,KAAK,CAAC,oBAAoB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,CAAC,EAAE;gBAC1F,CAAC;gBACD,IAAI,CAAC,WAAW;aAChB,CAAC,CAAA;QACH,CAAC;QAED,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAA;QACvC,MAAM,YAAY,GAAG,MAAM,CAAC,MAAM,CAAA;QAClC,IAAI,CAAC,YAAY;YAAE,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAA;QACjF,MAAM,IAAI,GAAG,YAAY,CAAC,IAAoB,CAAA;QAC9C,MAAM,CAAC,EAAE,AAAD,EAAG,SAAS,CAAC,GAAG,YAAY,CAAC,IAAyC,CAAA;QAE9E,MAAM,MAAM,GAAe,EAAE,CAAA;QAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjC,MAAM,GAAG,GAAa,IAAI,KAAK,CAAC,SAAS,CAAC,CAAA;YAC1C,MAAM,IAAI,GAAG,CAAC,GAAG,SAAS,CAAA;YAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE;gBAAE,GAAG,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,GAAG,CAAC,CAAE,CAAA;YAC5D,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QACjB,CAAC;QACD,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,CAAA;IAC7B,CAAC;CACD"}
|
package/out/unit-repair.d.ts
CHANGED
|
@@ -5,32 +5,28 @@
|
|
|
5
5
|
*
|
|
6
6
|
* Secondary-unit regex repair pass — parser-improvement backlog (2026-05-30).
|
|
7
7
|
*
|
|
8
|
-
* The three-arena capability eval surfaced a persistent neural weakness: the
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
* model is untouched; this is a decoder-side correction, the same "lowest
|
|
15
|
-
* risk" lever family as postcode-repair.
|
|
8
|
+
* The three-arena capability eval surfaced a persistent neural weakness: the model DROPS secondary
|
|
9
|
+
* units. "123 Main St Apt 456" → no unit label; the postal-standards secondary-unit edge class
|
|
10
|
+
* scored 0% neural. Units have a rigid surface shape (a designator keyword + an identifier), so —
|
|
11
|
+
* exactly like the postcode-repair pass (#35) — we can detect them deterministically and repair
|
|
12
|
+
* the BIO labels AFTER decode but BEFORE `buildAddressTree`. The model is untouched; this is a
|
|
13
|
+
* decoder-side correction, the same "lowest risk" lever family as postcode-repair.
|
|
16
14
|
*
|
|
17
15
|
* PRECISION GUARDS (mirror postcode-repair — never regress a confident parse):
|
|
18
|
-
* - We only fire on EXPLICIT designators (Apt, Ste, Suite, Unit, Rm, Floor,
|
|
19
|
-
* Bldg, Flat, … + bare "#<n>"). Ambiguous tokens are deliberately excluded:
|
|
20
|
-
* "Box" (that's po_box), bare "F"/"No" (too greedy), "Space"/"Stop" (common
|
|
21
|
-
* words).
|
|
22
|
-
* - ADD path (model emitted no unit over the matched run): allowed ONLY over
|
|
23
|
-
* `O` tokens — never over house_number / street* / postcode / po_box / a
|
|
24
|
-
* geographic container. So a confidently-labeled street or number is safe.
|
|
25
|
-
* - SNAP path: when the model already started a unit span inside the match,
|
|
26
|
-
* we expand/clip it to the full detected shape.
|
|
27
|
-
* - Local smear-clip: unit tokens immediately flanking a snapped run are
|
|
28
|
-
* cleared (mirrors postcode-repair) so "Apt 4 Springfield" can't leave a
|
|
29
|
-
* stray I-unit on "Springfield".
|
|
30
16
|
*
|
|
31
|
-
*
|
|
32
|
-
*
|
|
33
|
-
*
|
|
17
|
+
* - We only fire on EXPLICIT designators (Apt, Ste, Suite, Unit, Rm, Floor, Bldg, Flat, … + bare
|
|
18
|
+
* "#<n>"). Ambiguous tokens are deliberately excluded: "Box" (that's po_box), bare "F"/"No"
|
|
19
|
+
* (too greedy), "Space"/"Stop" (common words).
|
|
20
|
+
* - ADD path (model emitted no unit over the matched run): allowed ONLY over `O` tokens — never over
|
|
21
|
+
* house_number / street* / postcode / po_box / a geographic container. So a
|
|
22
|
+
* confidently-labeled street or number is safe.
|
|
23
|
+
* - SNAP path: when the model already started a unit span inside the match, we expand/clip it to the
|
|
24
|
+
* full detected shape.
|
|
25
|
+
* - Local smear-clip: unit tokens immediately flanking a snapped run are cleared (mirrors
|
|
26
|
+
* postcode-repair) so "Apt 4 Springfield" can't leave a stray I-unit on "Springfield".
|
|
27
|
+
*
|
|
28
|
+
* Opt-in via `ParseOpts.unitRepair` (postcode-repair earned default-on only after a measured
|
|
29
|
+
* +135/0; unit-repair stays opt-in until the v0.7.2 arena re-run quantifies its delta).
|
|
34
30
|
*/
|
|
35
31
|
import type { DecoderToken } from "@mailwoman/core/decoder";
|
|
36
32
|
export interface RepairResult {
|
|
@@ -39,8 +35,8 @@ export interface RepairResult {
|
|
|
39
35
|
changed: number;
|
|
40
36
|
}
|
|
41
37
|
/**
|
|
42
|
-
* Repair secondary-unit label spans in a decoded token sequence using designator
|
|
43
|
-
*
|
|
38
|
+
* Repair secondary-unit label spans in a decoded token sequence using designator regexes. Returns a
|
|
39
|
+
* NEW token array (inputs are not mutated) plus a change count.
|
|
44
40
|
*/
|
|
45
41
|
export declare function repairUnitLabels(text: string, input: readonly DecoderToken[]): RepairResult;
|
|
46
42
|
//# sourceMappingURL=unit-repair.d.ts.map
|
package/out/unit-repair.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"unit-repair.d.ts","sourceRoot":"","sources":["../unit-repair.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"unit-repair.d.ts","sourceRoot":"","sources":["../unit-repair.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAA;AAiF3D,MAAM,WAAW,YAAY;IAC5B,MAAM,EAAE,YAAY,EAAE,CAAA;IACtB,gEAAgE;IAChE,OAAO,EAAE,MAAM,CAAA;CACf;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,YAAY,EAAE,GAAG,YAAY,CA8C3F"}
|
package/out/unit-repair.js
CHANGED
|
@@ -5,38 +5,34 @@
|
|
|
5
5
|
*
|
|
6
6
|
* Secondary-unit regex repair pass — parser-improvement backlog (2026-05-30).
|
|
7
7
|
*
|
|
8
|
-
* The three-arena capability eval surfaced a persistent neural weakness: the
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
* model is untouched; this is a decoder-side correction, the same "lowest
|
|
15
|
-
* risk" lever family as postcode-repair.
|
|
8
|
+
* The three-arena capability eval surfaced a persistent neural weakness: the model DROPS secondary
|
|
9
|
+
* units. "123 Main St Apt 456" → no unit label; the postal-standards secondary-unit edge class
|
|
10
|
+
* scored 0% neural. Units have a rigid surface shape (a designator keyword + an identifier), so —
|
|
11
|
+
* exactly like the postcode-repair pass (#35) — we can detect them deterministically and repair
|
|
12
|
+
* the BIO labels AFTER decode but BEFORE `buildAddressTree`. The model is untouched; this is a
|
|
13
|
+
* decoder-side correction, the same "lowest risk" lever family as postcode-repair.
|
|
16
14
|
*
|
|
17
15
|
* PRECISION GUARDS (mirror postcode-repair — never regress a confident parse):
|
|
18
|
-
* - We only fire on EXPLICIT designators (Apt, Ste, Suite, Unit, Rm, Floor,
|
|
19
|
-
* Bldg, Flat, … + bare "#<n>"). Ambiguous tokens are deliberately excluded:
|
|
20
|
-
* "Box" (that's po_box), bare "F"/"No" (too greedy), "Space"/"Stop" (common
|
|
21
|
-
* words).
|
|
22
|
-
* - ADD path (model emitted no unit over the matched run): allowed ONLY over
|
|
23
|
-
* `O` tokens — never over house_number / street* / postcode / po_box / a
|
|
24
|
-
* geographic container. So a confidently-labeled street or number is safe.
|
|
25
|
-
* - SNAP path: when the model already started a unit span inside the match,
|
|
26
|
-
* we expand/clip it to the full detected shape.
|
|
27
|
-
* - Local smear-clip: unit tokens immediately flanking a snapped run are
|
|
28
|
-
* cleared (mirrors postcode-repair) so "Apt 4 Springfield" can't leave a
|
|
29
|
-
* stray I-unit on "Springfield".
|
|
30
16
|
*
|
|
31
|
-
*
|
|
32
|
-
*
|
|
33
|
-
*
|
|
17
|
+
* - We only fire on EXPLICIT designators (Apt, Ste, Suite, Unit, Rm, Floor, Bldg, Flat, … + bare
|
|
18
|
+
* "#<n>"). Ambiguous tokens are deliberately excluded: "Box" (that's po_box), bare "F"/"No"
|
|
19
|
+
* (too greedy), "Space"/"Stop" (common words).
|
|
20
|
+
* - ADD path (model emitted no unit over the matched run): allowed ONLY over `O` tokens — never over
|
|
21
|
+
* house_number / street* / postcode / po_box / a geographic container. So a
|
|
22
|
+
* confidently-labeled street or number is safe.
|
|
23
|
+
* - SNAP path: when the model already started a unit span inside the match, we expand/clip it to the
|
|
24
|
+
* full detected shape.
|
|
25
|
+
* - Local smear-clip: unit tokens immediately flanking a snapped run are cleared (mirrors
|
|
26
|
+
* postcode-repair) so "Apt 4 Springfield" can't leave a stray I-unit on "Springfield".
|
|
27
|
+
*
|
|
28
|
+
* Opt-in via `ParseOpts.unitRepair` (postcode-repair earned default-on only after a measured
|
|
29
|
+
* +135/0; unit-repair stays opt-in until the v0.7.2 arena re-run quantifies its delta).
|
|
34
30
|
*/
|
|
35
31
|
/**
|
|
36
|
-
* Secondary-unit shape patterns, ordered most-specific → least. Case-insensitive
|
|
37
|
-
*
|
|
38
|
-
*
|
|
39
|
-
*
|
|
32
|
+
* Secondary-unit shape patterns, ordered most-specific → least. Case-insensitive (unit designators
|
|
33
|
+
* appear in every casing in real data). The identifier is a 1-5 digit number with an optional
|
|
34
|
+
* trailing letter ("4B"), a single letter ("STE D"), or a letter+digits — kept tight so we don't
|
|
35
|
+
* swallow following words.
|
|
40
36
|
*/
|
|
41
37
|
const UNIT_DESIGNATORS = "APARTMENT|APT|SUITE|STE|UNIT|ROOM|RM|FLOOR|FLR|FL|BUILDING|BLDG|DEPARTMENT|DEPT|LOT|TRAILER|TRLR|SLIP|HANGAR|PIER|FLAT|PH|PENTHOUSE";
|
|
42
38
|
const UNIT_PATTERNS = [
|
|
@@ -57,15 +53,14 @@ const UNIT_B = "B-unit";
|
|
|
57
53
|
const UNIT_I = "I-unit";
|
|
58
54
|
const OUTSIDE = "O";
|
|
59
55
|
/**
|
|
60
|
-
* Tags a unit span is allowed to overwrite on the ADD path. The v0.7.2 arena
|
|
61
|
-
*
|
|
62
|
-
*
|
|
63
|
-
*
|
|
64
|
-
*
|
|
65
|
-
*
|
|
66
|
-
*
|
|
67
|
-
*
|
|
68
|
-
* list so a confident parse is never clobbered. (`O` is always eligible.)
|
|
56
|
+
* Tags a unit span is allowed to overwrite on the ADD path. The v0.7.2 arena showed the dominant
|
|
57
|
+
* failure for bare designator-led units ("Flat 2 14 Smith St", "APT 2 …") is the model labeling the
|
|
58
|
+
* WHOLE designator+identifier run as `locality` — not leaving it `O`. An explicit designator +
|
|
59
|
+
* identifier is a high-confidence "this is a unit" shape (a real locality/suburb name never has
|
|
60
|
+
* that form), so — exactly like postcode-repair's ADD_OVER_TAGS — we let it reclaim a
|
|
61
|
+
* `locality`/`dependent_locality` span. Structural tags (house_number, street*, postcode, po_box,
|
|
62
|
+
* region, country, venue) stay off the list so a confident parse is never clobbered. (`O` is always
|
|
63
|
+
* eligible.)
|
|
69
64
|
*/
|
|
70
65
|
const ADD_OVER_TAGS = new Set(["locality", "dependent_locality"]);
|
|
71
66
|
function isUnitLabel(label) {
|
|
@@ -95,8 +90,8 @@ function collectMatches(text) {
|
|
|
95
90
|
return accepted;
|
|
96
91
|
}
|
|
97
92
|
/**
|
|
98
|
-
* Repair secondary-unit label spans in a decoded token sequence using designator
|
|
99
|
-
*
|
|
93
|
+
* Repair secondary-unit label spans in a decoded token sequence using designator regexes. Returns a
|
|
94
|
+
* NEW token array (inputs are not mutated) plus a change count.
|
|
100
95
|
*/
|
|
101
96
|
export function repairUnitLabels(text, input) {
|
|
102
97
|
const matches = collectMatches(text);
|
package/out/unit-repair.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"unit-repair.js","sourceRoot":"","sources":["../unit-repair.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"unit-repair.js","sourceRoot":"","sources":["../unit-repair.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAYH;;;;;GAKG;AACH,MAAM,gBAAgB,GACrB,qIAAqI,CAAA;AAEtI,MAAM,aAAa,GAAyC;IAC3D,kFAAkF;IAClF,wEAAwE;IACxE,iFAAiF;IACjF,kFAAkF;IAClF,8EAA8E;IAC9E,kCAAkC;IAClC;QACC,KAAK,EAAE,YAAY;QACnB,EAAE,EAAE,IAAI,MAAM,CACb,SAAS,gBAAgB,4EAA4E,EACrG,IAAI,CACJ;KACD;IACD,8EAA8E;IAC9E,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE,EAAE,yBAAyB,EAAE;CAChD,CAAA;AAED,MAAM,MAAM,GAAG,QAAiC,CAAA;AAChD,MAAM,MAAM,GAAG,QAAiC,CAAA;AAChD,MAAM,OAAO,GAAG,GAA4B,CAAA;AAE5C;;;;;;;;;GASG;AACH,MAAM,aAAa,GAAG,IAAI,GAAG,CAAS,CAAC,UAAU,EAAE,oBAAoB,CAAC,CAAC,CAAA;AAEzE,SAAS,WAAW,CAAC,KAAa;IACjC,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,QAAQ,CAAA;AAChD,CAAC;AAED,qFAAqF;AACrF,SAAS,KAAK,CAAC,KAAa;IAC3B,OAAO,KAAK,KAAK,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;AAC7C,CAAC;AAED,mGAAmG;AACnG,SAAS,cAAc,CAAC,IAAY;IACnC,MAAM,UAAU,GAAgB,EAAE,CAAA;IAClC,aAAa,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,QAAQ,EAAE,EAAE;QACvC,GAAG,CAAC,EAAE,CAAC,SAAS,GAAG,CAAC,CAAA;QACpB,KAAK,IAAI,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YAC1D,UAAU,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,CAAC,CAAA;QAC1E,CAAC;IACF,CAAC,CAAC,CAAA;IACF,yFAAyF;IACzF,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAA;IACzF,MAAM,QAAQ,GAAgB,EAAE,CAAA;IAChC,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;QAC5B,IAAI,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAC;YAAE,SAAQ;QACtE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;IACD,OAAO,QAAQ,CAAA;AAChB,CAAC;AAQD;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY,EAAE,KAA8B;IAC5E,MAAM,OAAO,GAAG,cAAc,CAAC,IAAI,CAAC,CAAA;IACpC,MAAM,
MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC,CAAA;IAC3C,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAC,EAAE,CAAA;IAEvD,IAAI,OAAO,GAAG,CAAC,CAAA;IACf,MAAM,QAAQ,GAAG,CAAC,CAAS,EAAE,KAA4B,EAAQ,EAAE;QAClE,IAAI,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,KAAK,KAAK,EAAE,CAAC;YAChC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,GAAG,KAAK,CAAA;YACxB,OAAO,EAAE,CAAA;QACV,CAAC;IACF,CAAC,CAAA;IAED,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACzB,+CAA+C;QAC/C,MAAM,OAAO,GAAa,EAAE,CAAA;QAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACpB,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG;gBAAE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACxD,CAAC;QACD,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;YAAE,SAAQ;QAElC,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,CAAC,CAAA;QAClE,IAAI,CAAC,OAAO,EAAE,CAAC;YACd,iFAAiF;YACjF,6EAA6E;YAC7E,oFAAoF;YACpF,+BAA+B;YAC/B,MAAM,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE;gBAChC,MAAM,GAAG,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,CAAA;gBACnC,OAAO,GAAG,KAAK,IAAI,IAAI,aAAa,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;YAC9C,CAAC,CAAC,CAAA;YACF,IAAI,CAAC,IAAI;gBAAE,SAAQ;QACpB,CAAC;QAED,2DAA2D;QAC3D,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAA;QAEjE,4EAA4E;QAC5E,KAAK,IAAI,CAAC,GAAG,OAAO,CAAC,CAAC,CAAE,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,WAAW,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,EAAE,CAAC,EAAE;YAAE,QAAQ,CAAC,CAAC,EAAE,OAAO,CAAC,CAAA;QAChG,KAAK,IAAI,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAE,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,IAAI,WAAW,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YACxG,QAAQ,CAAC,CAAC,EAAE,OAAO,CAAC,CAAA;QACrB,CAAC;IACF,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAA;AAC3B,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mailwoman/neural",
|
|
3
|
-
"version": "4.
|
|
3
|
+
"version": "4.2.0",
|
|
4
4
|
"description": "Mailwoman neural classifier runtime: SentencePiece tokenizer + ONNX inference + decoder wiring.",
|
|
5
5
|
"license": "AGPL-3.0-only",
|
|
6
6
|
"repository": {
|
|
@@ -20,8 +20,8 @@
|
|
|
20
20
|
"./browser": "./out/browser.js"
|
|
21
21
|
},
|
|
22
22
|
"dependencies": {
|
|
23
|
-
"@mailwoman/codex": "4.
|
|
24
|
-
"@mailwoman/core": "4.
|
|
23
|
+
"@mailwoman/codex": "4.2.0",
|
|
24
|
+
"@mailwoman/core": "4.2.0",
|
|
25
25
|
"@sctg/sentencepiece-js": "^1.3.3",
|
|
26
26
|
"onnxruntime-node": "^1.26.0"
|
|
27
27
|
},
|