@mailwoman/neural 4.0.0 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/anchor-inference.d.ts +10 -9
- package/out/anchor-inference.d.ts.map +1 -1
- package/out/anchor-inference.js +10 -9
- package/out/anchor-inference.js.map +1 -1
- package/out/classifier.d.ts +14 -7
- package/out/classifier.d.ts.map +1 -1
- package/out/classifier.js +4 -4
- package/out/classifier.js.map +1 -1
- package/out/unit-repair.d.ts +21 -25
- package/out/unit-repair.d.ts.map +1 -1
- package/out/unit-repair.js +33 -38
- package/out/unit-repair.js.map +1 -1
- package/package.json +3 -3
|
@@ -4,14 +4,15 @@
|
|
|
4
4
|
* @author Teffen Ellis, et al.
|
|
5
5
|
*
|
|
6
6
|
* Inference-side postcode-anchor features (#239/#240) — the mirror of the Python training pipeline
|
|
7
|
-
* (`mailwoman_train/tokenizer.py::anchor_feature_vector` + `realign_anchor_to_pieces`). At
|
|
8
|
-
* the model conditions on per-piece anchor features fed alongside `input_ids`; this
|
|
9
|
-
* a raw address + its SentencePiece pieces, using the SAME postcode→anchor
|
|
10
|
-
* against (`scripts/build-pilot-anchor-lookup.py`), so the feature
|
|
7
|
+
* (`mailwoman_train/tokenizer.py::anchor_feature_vector` + `realign_anchor_to_pieces`). At
|
|
8
|
+
* inference the model conditions on per-piece anchor features fed alongside `input_ids`; this
|
|
9
|
+
* builds them from a raw address + its SentencePiece pieces, using the SAME postcode→anchor
|
|
10
|
+
* lookup the model trained against (`scripts/build-pilot-anchor-lookup.py`), so the feature
|
|
11
|
+
* layout matches byte-for-byte.
|
|
11
12
|
*
|
|
12
13
|
* The layout is LOAD-BEARING and cross-language: a wrong locale order or centroid scale feeds the
|
|
13
|
-
* model garbage. `anchor-inference.test.ts` pins both `LOCALE_ORDER` and the vector to values
|
|
14
|
-
* by the Python `anchor_feature_vector` — any drift fails the test.
|
|
14
|
+
* model garbage. `anchor-inference.test.ts` pins both `LOCALE_ORDER` and the vector to values
|
|
15
|
+
* emitted by the Python `anchor_feature_vector` — any drift fails the test.
|
|
15
16
|
*/
|
|
16
17
|
import type { TokenizedPiece } from "./tokenizer.js";
|
|
17
18
|
/**
|
|
@@ -36,9 +37,9 @@ export type AnchorLookup = Map<string, AnchorEntry>;
|
|
|
36
37
|
*/
|
|
37
38
|
export declare function anchorFeatureVector(posterior: Record<string, number>, lat: number, lon: number): number[];
|
|
38
39
|
/**
|
|
39
|
-
* Parse the pilot postcode→anchor lookup JSON (`{postcode: [posterior, lat, lon]}`) into a Map.
|
|
40
|
-
* (takes the parsed object, not a path) so this module stays browser-safe — the file read
|
|
41
|
-
* the Node-side caller (the eval).
|
|
40
|
+
* Parse the pilot postcode→anchor lookup JSON (`{postcode: [posterior, lat, lon]}`) into a Map.
|
|
41
|
+
* Pure (takes the parsed object, not a path) so this module stays browser-safe — the file read
|
|
42
|
+
* lives in the Node-side caller (the eval).
|
|
42
43
|
*/
|
|
43
44
|
export declare function parseAnchorLookup(raw: Record<string, [Record<string, number>, number, number]>): AnchorLookup;
|
|
44
45
|
/**
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"anchor-inference.d.ts","sourceRoot":"","sources":["../anchor-inference.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"anchor-inference.d.ts","sourceRoot":"","sources":["../anchor-inference.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAEpD;;;;GAIG;AACH,eAAO,MAAM,YAAY,iEAAkE,CAAA;AAE3F,6EAA6E;AAC7E,eAAO,MAAM,kBAAkB,QAA0B,CAAA;AAEzD,mGAAmG;AACnG,MAAM,WAAW,WAAW;IAC3B,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IACjC,GAAG,EAAE,MAAM,CAAA;IACX,GAAG,EAAE,MAAM,CAAA;CACX;AAED,MAAM,MAAM,YAAY,GAAG,GAAG,CAAC,MAAM,EAAE,WAAW,CAAC,CAAA;AAEnD;;;;GAIG;AACH,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE,CAgBzG;AAED;;;;GAIG;AACH,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC,GAAG,YAAY,CAI7G;AAED;;;;;;;;GAQG;AACH,wBAAgB,mBAAmB,CAClC,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,aAAa,CAAC,cAAc,CAAC,EACrC,MAAM,EAAE,YAAY,GAClB;IAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,CAAC;IAAC,UAAU,EAAE,MAAM,EAAE,CAAA;CAAE,CA0BhD"}
|
package/out/anchor-inference.js
CHANGED
|
@@ -4,14 +4,15 @@
|
|
|
4
4
|
* @author Teffen Ellis, et al.
|
|
5
5
|
*
|
|
6
6
|
* Inference-side postcode-anchor features (#239/#240) — the mirror of the Python training pipeline
|
|
7
|
-
* (`mailwoman_train/tokenizer.py::anchor_feature_vector` + `realign_anchor_to_pieces`). At
|
|
8
|
-
* the model conditions on per-piece anchor features fed alongside `input_ids`; this
|
|
9
|
-
* a raw address + its SentencePiece pieces, using the SAME postcode→anchor
|
|
10
|
-
* against (`scripts/build-pilot-anchor-lookup.py`), so the feature
|
|
7
|
+
* (`mailwoman_train/tokenizer.py::anchor_feature_vector` + `realign_anchor_to_pieces`). At
|
|
8
|
+
* inference the model conditions on per-piece anchor features fed alongside `input_ids`; this
|
|
9
|
+
* builds them from a raw address + its SentencePiece pieces, using the SAME postcode→anchor
|
|
10
|
+
* lookup the model trained against (`scripts/build-pilot-anchor-lookup.py`), so the feature
|
|
11
|
+
* layout matches byte-for-byte.
|
|
11
12
|
*
|
|
12
13
|
* The layout is LOAD-BEARING and cross-language: a wrong locale order or centroid scale feeds the
|
|
13
|
-
* model garbage. `anchor-inference.test.ts` pins both `LOCALE_ORDER` and the vector to values
|
|
14
|
-
* by the Python `anchor_feature_vector` — any drift fails the test.
|
|
14
|
+
* model garbage. `anchor-inference.test.ts` pins both `LOCALE_ORDER` and the vector to values
|
|
15
|
+
* emitted by the Python `anchor_feature_vector` — any drift fails the test.
|
|
15
16
|
*/
|
|
16
17
|
/**
|
|
17
18
|
* The locale class order — MUST match Python `mailwoman_train/labels.py::LOCALE_COUNTRIES`. The
|
|
@@ -45,9 +46,9 @@ export function anchorFeatureVector(posterior, lat, lon) {
|
|
|
45
46
|
return vec;
|
|
46
47
|
}
|
|
47
48
|
/**
|
|
48
|
-
* Parse the pilot postcode→anchor lookup JSON (`{postcode: [posterior, lat, lon]}`) into a Map.
|
|
49
|
-
* (takes the parsed object, not a path) so this module stays browser-safe — the file read
|
|
50
|
-
* the Node-side caller (the eval).
|
|
49
|
+
* Parse the pilot postcode→anchor lookup JSON (`{postcode: [posterior, lat, lon]}`) into a Map.
|
|
50
|
+
* Pure (takes the parsed object, not a path) so this module stays browser-safe — the file read
|
|
51
|
+
* lives in the Node-side caller (the eval).
|
|
51
52
|
*/
|
|
52
53
|
export function parseAnchorLookup(raw) {
|
|
53
54
|
const out = new Map();
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"anchor-inference.js","sourceRoot":"","sources":["../anchor-inference.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"anchor-inference.js","sourceRoot":"","sources":["../anchor-inference.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAIH;;;;GAIG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAU,CAAA;AAE3F,6EAA6E;AAC7E,MAAM,CAAC,MAAM,kBAAkB,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC,CAAA;AAWzD;;;;GAIG;AACH,MAAM,UAAU,mBAAmB,CAAC,SAAiC,EAAE,GAAW,EAAE,GAAW;IAC9F,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,kBAAkB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACzD,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,KAAK,MAAM,CAAC,OAAO,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC;QAC3D,MAAM,GAAG,GAAG,YAAY,CAAC,OAAO,CAAC,OAAO,CAAC,WAAW,EAAmC,CAAC,CAAA;QACxF,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC;YACd,GAAG,CAAC,GAAG,CAAC,GAAG,MAAM,CAAA;YACjB,KAAK,IAAI,MAAM,CAAA;QAChB,CAAC;IACF,CAAC;IACD,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;QACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,CAAC,CAAE,IAAI,KAAK,CAAA;IAC/D,CAAC;IACD,GAAG,CAAC,YAAY,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,EAAE,CAAC,CAAC,CAAA;IAC9D,GAAG,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,GAAG,CAAC,CAAC,CAAA;IACnE,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,iBAAiB,CAAC,GAA6D;IAC9F,MAAM,GAAG,GAAiB,IAAI,GAAG,EAAE,CAAA;IACnC,KAAK,MAAM,CAAC,EAAE,EAAE,CAAC,SAAS,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,GAAG,CAAC,GAAG,CAAC,EAAE,EAAE,EAAE,SAAS,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC,CAAA;IACnG,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,mBAAmB,CAClC,IAAY,EACZ,MAAqC,EACrC,MAAoB;IAEpB,MAAM,QAAQ,GAAe,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,IAAI,KAAK,CAAS,kBAAkB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;IAC5F,MAAM,UAAU,GAAa,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAA;IAEhD,MAAM,OAAO,GAAG,eAAe,CAAA;IAC/B,IAAI,CAAyB,CAAA;IAC7B,OAAO,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC1C,MAA
M,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAA;QAC5C,IAAI,CAAC,KAAK;YAAE,SAAQ;QACpB,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAA;QACzB,MAAM,OAAO,GAAG,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAA;QACrC,MAAM,GAAG,GAAG,mBAAmB,CAAC,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,GAAG,EAAE,KAAK,CAAC,GAAG,CAAC,CAAA;QACtE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACpB,KAAK,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,CAAC,GAAG,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,EAAE,CAAC;oBAC7C,IAAI,CAAC,IAAI,SAAS,IAAI,CAAC,GAAG,OAAO,EAAE,CAAC;wBACnC,QAAQ,CAAC,CAAC,CAAC,GAAG,GAAG,CAAA;wBACjB,UAAU,CAAC,CAAC,CAAC,GAAG,GAAG,CAAA;oBACpB,CAAC;oBACD,MAAK,CAAC,oFAAoF;gBAC3F,CAAC;YACF,CAAC;QACF,CAAC;IACF,CAAC;IACD,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAA;AAChC,CAAC"}
|
package/out/classifier.d.ts
CHANGED
|
@@ -9,13 +9,13 @@
|
|
|
9
9
|
*
|
|
10
10
|
* Convenience wrappers `parseJson` / `parseTuples` / `parseXml` project the tree on the way out.
|
|
11
11
|
*/
|
|
12
|
-
import { decodeAsXml, type AddressTree, type ComponentTag } from "@mailwoman/core/decoder";
|
|
12
|
+
import { decodeAsXml, type AddressTree, type Calibrator, type ComponentTag } from "@mailwoman/core/decoder";
|
|
13
|
+
import { type AnchorLookup } from "./anchor-inference.js";
|
|
13
14
|
import { type FstMatcherLike } from "./fst-prior.js";
|
|
14
15
|
import type { InferResult } from "./onnx-runner.js";
|
|
15
16
|
import { type QueryShapeLike } from "./query-shape-prior.js";
|
|
16
17
|
import { type StreetMorphologyPriorOpts } from "./street-morphology-prior.js";
|
|
17
18
|
import { MailwomanTokenizer } from "./tokenizer.js";
|
|
18
|
-
import { type AnchorLookup } from "./anchor-inference.js";
|
|
19
19
|
import type { ResolveWeightsOpts } from "./weights.js";
|
|
20
20
|
/**
|
|
21
21
|
* Structural type the classifier needs from a runner. Lets callers swap the Node-side `OnnxRunner`
|
|
@@ -163,12 +163,19 @@ export interface ParseOpts {
|
|
|
163
163
|
*/
|
|
164
164
|
postcodeRepair?: boolean;
|
|
165
165
|
/**
|
|
166
|
-
* When true, run the deterministic secondary-unit regex repair pass on the decoded label
|
|
167
|
-
*
|
|
168
|
-
*
|
|
169
|
-
*
|
|
170
|
-
*
|
|
166
|
+
* When true, run the deterministic secondary-unit regex repair pass on the decoded label sequence
|
|
167
|
+
* before tree-building. Detects designator-shaped substrings ("Apt 4B", "Ste 12", "Unit 9400",
|
|
168
|
+
* bare "#104", …) and snaps/adds the unit span, fixing the unit-drop weakness the three-arena
|
|
169
|
+
* capability eval surfaced (postal secondary-unit 0% neural). Off by default — opt-in until the
|
|
170
|
+
* v0.7.2 arena re-run quantifies its delta. See `./unit-repair.ts`.
|
|
171
171
|
*/
|
|
172
172
|
unitRepair?: boolean;
|
|
173
|
+
/**
|
|
174
|
+
* Optional span-confidence calibrator (task #59). When provided, each decoded span's `conf=` is
|
|
175
|
+
* mapped through it (isotonic lookup table → calibrated probability of correctness). OPT-IN —
|
|
176
|
+
* omit for the byte-stable default softmax confidence. Build one via `createCalibrator`
|
|
177
|
+
* (`@mailwoman/core/decoder`) from `data/eval/calibration/isotonic-<locale>-<version>.json`.
|
|
178
|
+
*/
|
|
179
|
+
calibrate?: Calibrator;
|
|
173
180
|
}
|
|
174
181
|
//# sourceMappingURL=classifier.d.ts.map
|
package/out/classifier.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAIN,WAAW,EACX,KAAK,WAAW,EAChB,KAAK,YAAY,EAEjB,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAA0B,KAAK,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAE5E,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA;
|
|
1
|
+
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAIN,WAAW,EACX,KAAK,WAAW,EAChB,KAAK,UAAU,EACf,KAAK,YAAY,EAEjB,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAAuB,KAAK,YAAY,EAAE,MAAM,uBAAuB,CAAA;AAC9E,OAAO,EAA0B,KAAK,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAE5E,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA;AAEnD,OAAO,EAA0C,KAAK,cAAc,EAAE,MAAM,wBAAwB,CAAA;AACpG,OAAO,EAAuC,KAAK,yBAAyB,EAAE,MAAM,8BAA8B,CAAA;AAClH,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AAGnD,OAAO,KAAK,EAAE,kBAAkB,EAAmB,MAAM,cAAc,CAAA;AAEvE;;;;GAIG;AACH,MAAM,WAAW,YAAY;IAC5B,KAAK,CACJ,QAAQ,EAAE,MAAM,EAAE,EAClB,MAAM,CAAC,EAAE;QAAE,QAAQ,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC;QAAC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,CAAA;KAAE,GAC5F,OAAO,CAAC,WAAW,CAAC,CAAA;CACvB;AAED,MAAM,WAAW,6BAA6B;IAC7C,SAAS,EAAE,kBAAkB,CAAA;IAC7B,MAAM,EAAE,YAAY,CAAA;IACpB;;;;OAIG;IACH,MAAM,CAAC,EAAE,SAAS,MAAM,EAAE,CAAA;IAC1B;;;;;;;OAOG;IACH,MAAM,CAAC,EAAE,SAAS,GAAG,QAAQ,CAAA;IAC7B;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,EAAE,EAAE,CAAA;IACxB,sEAAsE;IACtE,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAA;IAC3B,oEAAoE;IACpE,cAAc,CAAC,EAAE,MAAM,EAAE,CAAA;IACzB;;;;;OAKG;IACH,oBAAoB,CAAC,EAAE,YAAY,CAAA;CACnC;AAED,qBAAa,uBAAuB;IAOvB,OAAO,CAAC,QAAQ,CAAC,GAAG;IANhC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmB;IAC1C,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAsB;IACjD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAY;IACxC,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAU;IAC3C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAU;gBAEZ,GAAG,EAAE,6BAA6B;IAa/D;;;;;;;;;;;OAWG;WACU,eAAe,CAC3B,IAAI,GAAE,kBAAkB,GAAG;QAAE,oBAAoB,CAAC,EAAE,YAAY,CAAA;KAAO,GACrE,OAAO,CAAC,uBAAuB,CAAC;IA4BnC,6DAA6D;IACvD,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC;IA4EjE;;;OAGG;IACG,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,qBAAqB,CAAC;IA0E/E,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAIzF,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAInF,QAAQ,CA
AC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG;QAAE,GAAG,CAAC,EAAE,UAAU,CAAC,OAAO,WAAW,CAAC,CAAC,CAAC,CAAC,CAAA;KAAE,GAAG,OAAO,CAAC,MAAM,CAAC;IAI7G;;;;;;;;;;OAUG;IACH,OAAO,CAAC,mBAAmB;CAW3B;AAED,wFAAwF;AACxF,MAAM,WAAW,qBAAqB;IACrC,IAAI,EAAE,WAAW,CAAA;IACjB,MAAM,EAAE,MAAM,EAAE,EAAE,CAAA;IAClB,MAAM,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;CAC7C;AAED;;;GAGG;AACH,MAAM,WAAW,SAAS;IACzB;;;;OAIG;IACH,UAAU,CAAC,EAAE,cAAc,CAAA;IAC3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAA;IAC5B;;;OAGG;IACH,GAAG,CAAC,EAAE,cAAc,CAAA;IACpB,6DAA6D;IAC7D,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB;;;;;;OAMG;IACH,mBAAmB,CAAC,EAAE,cAAc,CAAA;IACpC,yDAAyD;IACzD,uBAAuB,CAAC,EAAE,yBAAyB,CAAA;IACnD;;;;;;OAMG;IACH,cAAc,CAAC,EAAE,OAAO,CAAA;IACxB;;;;;;OAMG;IACH,UAAU,CAAC,EAAE,OAAO,CAAA;IACpB;;;;;OAKG;IACH,SAAS,CAAC,EAAE,UAAU,CAAA;CACtB"}
|
package/out/classifier.js
CHANGED
|
@@ -10,14 +10,14 @@
|
|
|
10
10
|
* Convenience wrappers `parseJson` / `parseTuples` / `parseXml` project the tree on the way out.
|
|
11
11
|
*/
|
|
12
12
|
import { buildAddressTree, decodeAsJson, decodeAsTuples, decodeAsXml, } from "@mailwoman/core/decoder";
|
|
13
|
+
import { buildAnchorFeatures } from "./anchor-inference.js";
|
|
13
14
|
import { buildFstEmissionPriors } from "./fst-prior.js";
|
|
14
15
|
import { STAGE2_BIO_LABELS } from "./labels.js";
|
|
15
16
|
import { repairPostcodeLabels } from "./postcode-repair.js";
|
|
16
|
-
import { repairUnitLabels } from "./unit-repair.js";
|
|
17
17
|
import { addEmissionMatrix, buildEmissionPriors } from "./query-shape-prior.js";
|
|
18
18
|
import { buildStreetMorphologyEmissionPriors } from "./street-morphology-prior.js";
|
|
19
19
|
import { MailwomanTokenizer } from "./tokenizer.js";
|
|
20
|
-
import { buildAnchorFeatures } from "./anchor-inference.js";
|
|
20
|
+
import { repairUnitLabels } from "./unit-repair.js";
|
|
21
21
|
import { buildBioEndMask, buildBioStartMask, buildBioTransitionMask, softmax, viterbi } from "./viterbi.js";
|
|
22
22
|
export class NeuralAddressClassifier {
|
|
23
23
|
cfg;
|
|
@@ -130,7 +130,7 @@ export class NeuralAddressClassifier {
|
|
|
130
130
|
if (opts?.unitRepair) {
|
|
131
131
|
tokens = repairUnitLabels(text, tokens).tokens;
|
|
132
132
|
}
|
|
133
|
-
return buildAddressTree(text, tokens);
|
|
133
|
+
return buildAddressTree(text, tokens, opts?.calibrate ? { calibrate: opts.calibrate } : undefined);
|
|
134
134
|
}
|
|
135
135
|
/**
|
|
136
136
|
* Like `parse`, but also returns the raw per-token logits and piece offsets needed for per-span
|
|
@@ -182,7 +182,7 @@ export class NeuralAddressClassifier {
|
|
|
182
182
|
};
|
|
183
183
|
});
|
|
184
184
|
return {
|
|
185
|
-
tree: buildAddressTree(text, tokens),
|
|
185
|
+
tree: buildAddressTree(text, tokens, opts?.calibrate ? { calibrate: opts.calibrate } : undefined),
|
|
186
186
|
logits,
|
|
187
187
|
pieces: pieces.map((p) => ({ start: p.start, end: p.end })),
|
|
188
188
|
};
|
package/out/classifier.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"classifier.js","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EACN,gBAAgB,EAChB,YAAY,EACZ,cAAc,EACd,WAAW,
|
|
1
|
+
{"version":3,"file":"classifier.js","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EACN,gBAAgB,EAChB,YAAY,EACZ,cAAc,EACd,WAAW,GAKX,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAAE,mBAAmB,EAAqB,MAAM,uBAAuB,CAAA;AAC9E,OAAO,EAAE,sBAAsB,EAAuB,MAAM,gBAAgB,CAAA;AAC5E,OAAO,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAA;AAE/C,OAAO,EAAE,oBAAoB,EAAE,MAAM,sBAAsB,CAAA;AAC3D,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAuB,MAAM,wBAAwB,CAAA;AACpG,OAAO,EAAE,mCAAmC,EAAkC,MAAM,8BAA8B,CAAA;AAClH,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAA;AACnD,OAAO,EAAE,eAAe,EAAE,iBAAiB,EAAE,sBAAsB,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAA;AAoD3G,MAAM,OAAO,uBAAuB;IAON;IANZ,MAAM,CAAmB;IACzB,UAAU,CAAsB;IAChC,WAAW,CAAY;IACvB,gBAAgB,CAAU;IAC1B,cAAc,CAAU;IAEzC,YAA6B,GAAkC;QAAlC,QAAG,GAAH,GAAG,CAA+B;QAC9D,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,MAAM,IAAI,iBAAiB,CAAA;QAC7C,IAAI,CAAC,UAAU,GAAG,GAAG,CAAC,MAAM,IAAI,SAAS,CAAA;QACzC,MAAM,UAAU,GAAG,sBAAsB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACtD,IAAI,GAAG,CAAC,WAAW,EAAE,CAAC;YACrB,IAAI,CAAC,WAAW,GAAG,WAAW,CAAC,UAAU,EAAE,GAAG,CAAC,WAAW,CAAC,CAAA;QAC5D,CAAC;aAAM,CAAC;YACP,IAAI,CAAC,WAAW,GAAG,UAAU,CAAA;QAC9B,CAAC;QACD,IAAI,CAAC,gBAAgB,GAAG,GAAG,CAAC,gBAAgB,IAAI,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC9E,IAAI,CAAC,cAAc,GAAG,GAAG,CAAC,cAAc,IAAI,eAAe,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACzE,CAAC;IAED;;;;;;;;;;;OAWG;IACH,MAAM,CAAC,KAAK,CAAC,eAAe,CAC3B,OAAqE,EAAE;QAEvE,yFAAyF;QACzF,2FAA2F;QAC3F,uFAAuF;QACvF,0FAA0F;QAC1F,2BAA2B;QAC3B,MAAM,CAAC,EAAE,UAAU,EAAE,EAAE,EAAE,cAAc,EAAE,uBAAuB,EAAE,kBAAkB,EAAE,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC3G,MAAM,CAAC,yBAAyB,CAAC,kBAAkB,CAAC;YACpD,MAAM,CAAC,yBAAyB,CAAC,cAAc,CAAC;SAChD,CAAC,CAAA;QACF,MAAM,QAAQ,GAAoB,cAAc,CAAC,IAAI,CAAC,CAAA;QACtD,MAAM,MAAM,GAAG,uBAAuB,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAA;QAC9D,MAAM,GAAG,GAAG,kBAAkB,CAAC,QAAQ,CAAC,kBAAkB,CAAC,CAAA;QAC3D,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC7C,kBAAkB,CAAC,YAAY,CAAC,QAAQ,CAAC,aAAa,CAAC;YACvD,UAAU,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC;
SACrC,CAAC,CAAA;QACF,OAAO,IAAI,uBAAuB,CAAC;YAClC,SAAS;YACT,MAAM;YACN,MAAM;YACN,WAAW,EAAE,GAAG,EAAE,WAAW;YAC7B,gBAAgB,EAAE,GAAG,EAAE,gBAAgB;YACvC,cAAc,EAAE,GAAG,EAAE,cAAc;YACnC,GAAG,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC,CAAC,EAAE,oBAAoB,EAAE,IAAI,CAAC,oBAAoB,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACzF,CAAC,CAAA;IACH,CAAC;IAED,6DAA6D;IAC7D,KAAK,CAAC,KAAK,CAAC,IAAY,EAAE,IAAgB;QACzC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;QAEtD,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;QACvD,gGAAgG;QAChG,+EAA+E;QAC/E,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,oBAAoB;YAC3C,CAAC,CAAC,mBAAmB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,oBAAoB,CAAC;YAClE,CAAC,CAAC,SAAS,CAAA;QACZ,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,MAAM,CAAC,CAAA;QAE3D,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,CAAA;QAEhC,IAAI,SAAS,GAAG,IAAI,EAAE,UAAU;YAC/B,CAAC,CAAC,iBAAiB,CACjB,MAAM,EACN,mBAAmB,CAAC,IAAI,CAAC,UAAU,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACzD,SAAS,EAAE,IAAI,CAAC,mBAAmB,IAAI,GAAG;gBAC1C,SAAS,EAAE,IAAI;aACf,CAAC,CACF;YACF,CAAC,CAAC,MAAM,CAAA;QAET,IAAI,IAAI,EAAE,GAAG,EAAE,CAAC;YACf,SAAS,GAAG,iBAAiB,CAC5B,SAAS,EACT,sBAAsB,CAAC,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACrD,SAAS,EAAE,IAAI,CAAC,YAAY,IAAI,GAAG;aACnC,CAAC,CACF,CAAA;QACF,CAAC;QAED,IAAI,IAAI,EAAE,mBAAmB,EAAE,CAAC;YAC/B,SAAS,GAAG,iBAAiB,CAC5B,SAAS,EACT,mCAAmC,CAClC,IAAI,CAAC,mBAAmB,EACxB,MAAM,EACN,IAAI,CAAC,MAAM,EACX,IAAI,CAAC,uBAAuB,IAAI,EAAE,CAClC,CACD,CAAA;QACF,CAAC;QAED,MAAM,YAAY,GACjB,IAAI,CAAC,UAAU,KAAK,SAAS;YAC5B,CAAC,CAAC,OAAO,CAAC;gBACR,SAAS;gBACT,WAAW,EAAE,IAAI,CAAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;gBACvC,cAAc,EAAE,IAAI,CAAC,cAAc;aACnC,CAAC,CAAC,IAAI;YACR,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAA;QAElD,IAAI,MAAM,GAAmB,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAChD,MAAM,GAAG,GAAG,YAAY,CAAC,CAAC,CAAE,CAAA;YAC5B,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,
CAAA;YACjC,OAAO;gBACN,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,GAAG,EAAE,CAAC,CAAC,GAAG;gBACV,KAAK,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,GAAG,CAA0B;gBACzD,UAAU,EAAE,KAAK,CAAC,GAAG,CAAE;aACvB,CAAA;QACF,CAAC,CAAC,CAAA;QAEF,IAAI,IAAI,EAAE,cAAc,EAAE,CAAC;YAC1B,MAAM,GAAG,oBAAoB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC,MAAM,CAAA;QACnD,CAAC;QACD,IAAI,IAAI,EAAE,UAAU,EAAE,CAAC;YACtB,MAAM,GAAG,gBAAgB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC,MAAM,CAAA;QAC/C,CAAC;QAED,OAAO,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAA;IACnG,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,eAAe,CAAC,IAAY,EAAE,IAAgB;QACnD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,OAAO,EAAE,IAAI,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,CAAA;QAClE,CAAC;QACD,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;QACvD,gGAAgG;QAChG,+EAA+E;QAC/E,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,oBAAoB;YAC3C,CAAC,CAAC,mBAAmB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,oBAAoB,CAAC;YAClE,CAAC,CAAC,SAAS,CAAA;QACZ,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,MAAM,CAAC,CAAA;QAE3D,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,CAAA;QAEhC,IAAI,SAAS,GAAG,IAAI,EAAE,UAAU;YAC/B,CAAC,CAAC,iBAAiB,CACjB,MAAM,EACN,mBAAmB,CAAC,IAAI,CAAC,UAAU,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACzD,SAAS,EAAE,IAAI,CAAC,mBAAmB,IAAI,GAAG;gBAC1C,SAAS,EAAE,IAAI;aACf,CAAC,CACF;YACF,CAAC,CAAC,MAAM,CAAA;QAET,IAAI,IAAI,EAAE,GAAG,EAAE,CAAC;YACf,SAAS,GAAG,iBAAiB,CAC5B,SAAS,EACT,sBAAsB,CAAC,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACrD,SAAS,EAAE,IAAI,CAAC,YAAY,IAAI,GAAG;aACnC,CAAC,CACF,CAAA;QACF,CAAC;QAED,IAAI,IAAI,EAAE,mBAAmB,EAAE,CAAC;YAC/B,SAAS,GAAG,iBAAiB,CAC5B,SAAS,EACT,mCAAmC,CAClC,IAAI,CAAC,mBAAmB,EACxB,MAAM,EACN,IAAI,CAAC,MAAM,EACX,IAAI,CAAC,uBAAuB,IAAI,EAAE,CAClC,CACD,CAAA;QACF,CAAC;QAED,MAAM,YAAY,GACjB,IAAI,CAAC,UAAU,KAAK,SAAS;YAC5B,CAAC,CAAC,OAAO,CAAC;gBACR,SAAS;gBACT,WAAW,E
AAE,IAAI,CAAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;gBACvC,cAAc,EAAE,IAAI,CAAC,cAAc;aACnC,CAAC,CAAC,IAAI;YACR,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAA;QAElD,MAAM,MAAM,GAAmB,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAClD,MAAM,GAAG,GAAG,YAAY,CAAC,CAAC,CAAE,CAAA;YAC5B,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,CAAA;YACjC,OAAO;gBACN,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,GAAG,EAAE,CAAC,CAAC,GAAG;gBACV,KAAK,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,GAAG,CAA0B;gBACzD,UAAU,EAAE,KAAK,CAAC,GAAG,CAAE;aACvB,CAAA;QACF,CAAC,CAAC,CAAA;QAEF,OAAO;YACN,IAAI,EAAE,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;YACjG,MAAM;YACN,MAAM,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC;SAC3D,CAAA;IACF,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,IAAY,EAAE,IAAgB;QAC7C,OAAO,YAAY,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;IAClD,CAAC;IAED,KAAK,CAAC,WAAW,CAAC,IAAY,EAAE,IAAgB;QAC/C,OAAO,cAAc,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;IACpD,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,IAAY,EAAE,IAA8D;QAC1F,OAAO,WAAW,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,EAAE,IAAI,EAAE,GAAG,CAAC,CAAA;IAC5D,CAAC;IAED;;;;;;;;;;OAUG;IACK,mBAAmB,CAAC,MAA2B;QACtD,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;YAAE,OAAM;QAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAE,CAAC,MAAM,CAAA;QAC/B,IAAI,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YAChC,MAAM,IAAI,KAAK,CACd,wCAAwC,KAAK,2CAA2C;gBACvF,wBAAwB,IAAI,CAAC,MAAM,CAAC,MAAM,iDAAiD;gBAC3F,oFAAoF,CACrF,CAAA;QACF,CAAC;IACF,CAAC;CACD;AAmED,SAAS,aAAa,CAAC,GAAa;IACnC,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,IAAI,MAAM,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;IACpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,IAAI,GAAG,CAAC,CAAC,CAAE,GAAG,MAAM,EAAE,CAAC;YACtB,MAAM,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;YAChB,MAAM,GAAG,CAAC,CAAA;QACX
,CAAC;IACF,CAAC;IACD,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,KAAK,MAAM,CAAC,IAAI,GAAG;QAAE,MAAM,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,CAAA;IACnD,MAAM,IAAI,GAAG,CAAC,GAAG,MAAM,CAAA;IACvB,OAAO,EAAE,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,CAAA;AAC7B,CAAC;AAED,uGAAuG;AACvG,SAAS,WAAW,CAAC,CAAa,EAAE,CAAa;IAChD,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAA;IAClB,MAAM,GAAG,GAAe,EAAE,CAAA;IAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5B,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,CAAA;QAChC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAA;QAC1D,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACd,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC"}
|
package/out/unit-repair.d.ts
CHANGED
|
@@ -5,32 +5,28 @@
|
|
|
5
5
|
*
|
|
6
6
|
* Secondary-unit regex repair pass — parser-improvement backlog (2026-05-30).
|
|
7
7
|
*
|
|
8
|
-
* The three-arena capability eval surfaced a persistent neural weakness: the
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
* model is untouched; this is a decoder-side correction, the same "lowest
|
|
15
|
-
* risk" lever family as postcode-repair.
|
|
8
|
+
* The three-arena capability eval surfaced a persistent neural weakness: the model DROPS secondary
|
|
9
|
+
* units. "123 Main St Apt 456" → no unit label; the postal-standards secondary-unit edge class
|
|
10
|
+
* scored 0% neural. Units have a rigid surface shape (a designator keyword + an identifier), so —
|
|
11
|
+
* exactly like the postcode-repair pass (#35) — we can detect them deterministically and repair
|
|
12
|
+
* the BIO labels AFTER decode but BEFORE `buildAddressTree`. The model is untouched; this is a
|
|
13
|
+
* decoder-side correction, the same "lowest risk" lever family as postcode-repair.
|
|
16
14
|
*
|
|
17
15
|
* PRECISION GUARDS (mirror postcode-repair — never regress a confident parse):
|
|
18
|
-
* - We only fire on EXPLICIT designators (Apt, Ste, Suite, Unit, Rm, Floor,
|
|
19
|
-
* Bldg, Flat, … + bare "#<n>"). Ambiguous tokens are deliberately excluded:
|
|
20
|
-
* "Box" (that's po_box), bare "F"/"No" (too greedy), "Space"/"Stop" (common
|
|
21
|
-
* words).
|
|
22
|
-
* - ADD path (model emitted no unit over the matched run): allowed ONLY over
|
|
23
|
-
* `O` tokens — never over house_number / street* / postcode / po_box / a
|
|
24
|
-
* geographic container. So a confidently-labeled street or number is safe.
|
|
25
|
-
* - SNAP path: when the model already started a unit span inside the match,
|
|
26
|
-
* we expand/clip it to the full detected shape.
|
|
27
|
-
* - Local smear-clip: unit tokens immediately flanking a snapped run are
|
|
28
|
-
* cleared (mirrors postcode-repair) so "Apt 4 Springfield" can't leave a
|
|
29
|
-
* stray I-unit on "Springfield".
|
|
30
16
|
*
|
|
31
|
-
*
|
|
32
|
-
*
|
|
33
|
-
*
|
|
17
|
+
* - We only fire on EXPLICIT designators (Apt, Ste, Suite, Unit, Rm, Floor, Bldg, Flat, … + bare
|
|
18
|
+
* "#<n>"). Ambiguous tokens are deliberately excluded: "Box" (that's po_box), bare "F"/"No"
|
|
19
|
+
* (too greedy), "Space"/"Stop" (common words).
|
|
20
|
+
* - ADD path (model emitted no unit over the matched run): allowed ONLY over `O` tokens — never over
|
|
21
|
+
* house_number / street* / postcode / po_box / a geographic container. So a
|
|
22
|
+
* confidently-labeled street or number is safe.
|
|
23
|
+
* - SNAP path: when the model already started a unit span inside the match, we expand/clip it to the
|
|
24
|
+
* full detected shape.
|
|
25
|
+
* - Local smear-clip: unit tokens immediately flanking a snapped run are cleared (mirrors
|
|
26
|
+
* postcode-repair) so "Apt 4 Springfield" can't leave a stray I-unit on "Springfield".
|
|
27
|
+
*
|
|
28
|
+
* Opt-in via `ParseOpts.unitRepair` (postcode-repair earned default-on only after a measured
|
|
29
|
+
* +135/0; unit-repair stays opt-in until the v0.7.2 arena re-run quantifies its delta).
|
|
34
30
|
*/
|
|
35
31
|
import type { DecoderToken } from "@mailwoman/core/decoder";
|
|
36
32
|
export interface RepairResult {
|
|
@@ -39,8 +35,8 @@ export interface RepairResult {
|
|
|
39
35
|
changed: number;
|
|
40
36
|
}
|
|
41
37
|
/**
|
|
42
|
-
* Repair secondary-unit label spans in a decoded token sequence using designator
|
|
43
|
-
*
|
|
38
|
+
* Repair secondary-unit label spans in a decoded token sequence using designator regexes. Returns a
|
|
39
|
+
* NEW token array (inputs are not mutated) plus a change count.
|
|
44
40
|
*/
|
|
45
41
|
export declare function repairUnitLabels(text: string, input: readonly DecoderToken[]): RepairResult;
|
|
46
42
|
//# sourceMappingURL=unit-repair.d.ts.map
|
package/out/unit-repair.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"unit-repair.d.ts","sourceRoot":"","sources":["../unit-repair.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"unit-repair.d.ts","sourceRoot":"","sources":["../unit-repair.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAA;AAiF3D,MAAM,WAAW,YAAY;IAC5B,MAAM,EAAE,YAAY,EAAE,CAAA;IACtB,gEAAgE;IAChE,OAAO,EAAE,MAAM,CAAA;CACf;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,YAAY,EAAE,GAAG,YAAY,CA8C3F"}
|
package/out/unit-repair.js
CHANGED
|
@@ -5,38 +5,34 @@
|
|
|
5
5
|
*
|
|
6
6
|
* Secondary-unit regex repair pass — parser-improvement backlog (2026-05-30).
|
|
7
7
|
*
|
|
8
|
-
* The three-arena capability eval surfaced a persistent neural weakness: the
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
* model is untouched; this is a decoder-side correction, the same "lowest
|
|
15
|
-
* risk" lever family as postcode-repair.
|
|
8
|
+
* The three-arena capability eval surfaced a persistent neural weakness: the model DROPS secondary
|
|
9
|
+
* units. "123 Main St Apt 456" → no unit label; the postal-standards secondary-unit edge class
|
|
10
|
+
* scored 0% neural. Units have a rigid surface shape (a designator keyword + an identifier), so —
|
|
11
|
+
* exactly like the postcode-repair pass (#35) — we can detect them deterministically and repair
|
|
12
|
+
* the BIO labels AFTER decode but BEFORE `buildAddressTree`. The model is untouched; this is a
|
|
13
|
+
* decoder-side correction, the same "lowest risk" lever family as postcode-repair.
|
|
16
14
|
*
|
|
17
15
|
* PRECISION GUARDS (mirror postcode-repair — never regress a confident parse):
|
|
18
|
-
* - We only fire on EXPLICIT designators (Apt, Ste, Suite, Unit, Rm, Floor,
|
|
19
|
-
* Bldg, Flat, … + bare "#<n>"). Ambiguous tokens are deliberately excluded:
|
|
20
|
-
* "Box" (that's po_box), bare "F"/"No" (too greedy), "Space"/"Stop" (common
|
|
21
|
-
* words).
|
|
22
|
-
* - ADD path (model emitted no unit over the matched run): allowed ONLY over
|
|
23
|
-
* `O` tokens — never over house_number / street* / postcode / po_box / a
|
|
24
|
-
* geographic container. So a confidently-labeled street or number is safe.
|
|
25
|
-
* - SNAP path: when the model already started a unit span inside the match,
|
|
26
|
-
* we expand/clip it to the full detected shape.
|
|
27
|
-
* - Local smear-clip: unit tokens immediately flanking a snapped run are
|
|
28
|
-
* cleared (mirrors postcode-repair) so "Apt 4 Springfield" can't leave a
|
|
29
|
-
* stray I-unit on "Springfield".
|
|
30
16
|
*
|
|
31
|
-
*
|
|
32
|
-
*
|
|
33
|
-
*
|
|
17
|
+
* - We only fire on EXPLICIT designators (Apt, Ste, Suite, Unit, Rm, Floor, Bldg, Flat, … + bare
|
|
18
|
+
* "#<n>"). Ambiguous tokens are deliberately excluded: "Box" (that's po_box), bare "F"/"No"
|
|
19
|
+
* (too greedy), "Space"/"Stop" (common words).
|
|
20
|
+
* - ADD path (model emitted no unit over the matched run): allowed ONLY over `O` tokens — never over
|
|
21
|
+
* house_number / street* / postcode / po_box / a geographic container. So a
|
|
22
|
+
* confidently-labeled street or number is safe.
|
|
23
|
+
* - SNAP path: when the model already started a unit span inside the match, we expand/clip it to the
|
|
24
|
+
* full detected shape.
|
|
25
|
+
* - Local smear-clip: unit tokens immediately flanking a snapped run are cleared (mirrors
|
|
26
|
+
* postcode-repair) so "Apt 4 Springfield" can't leave a stray I-unit on "Springfield".
|
|
27
|
+
*
|
|
28
|
+
* Opt-in via `ParseOpts.unitRepair` (postcode-repair earned default-on only after a measured
|
|
29
|
+
* +135/0; unit-repair stays opt-in until the v0.7.2 arena re-run quantifies its delta).
|
|
34
30
|
*/
|
|
35
31
|
/**
|
|
36
|
-
* Secondary-unit shape patterns, ordered most-specific → least. Case-insensitive
|
|
37
|
-
*
|
|
38
|
-
*
|
|
39
|
-
*
|
|
32
|
+
* Secondary-unit shape patterns, ordered most-specific → least. Case-insensitive (unit designators
|
|
33
|
+
* appear in every casing in real data). The identifier is a 1-5 digit number with an optional
|
|
34
|
+
* trailing letter ("4B"), a single letter ("STE D"), or a letter+digits — kept tight so we don't
|
|
35
|
+
* swallow following words.
|
|
40
36
|
*/
|
|
41
37
|
const UNIT_DESIGNATORS = "APARTMENT|APT|SUITE|STE|UNIT|ROOM|RM|FLOOR|FLR|FL|BUILDING|BLDG|DEPARTMENT|DEPT|LOT|TRAILER|TRLR|SLIP|HANGAR|PIER|FLAT|PH|PENTHOUSE";
|
|
42
38
|
const UNIT_PATTERNS = [
|
|
@@ -57,15 +53,14 @@ const UNIT_B = "B-unit";
|
|
|
57
53
|
const UNIT_I = "I-unit";
|
|
58
54
|
const OUTSIDE = "O";
|
|
59
55
|
/**
|
|
60
|
-
* Tags a unit span is allowed to overwrite on the ADD path. The v0.7.2 arena
|
|
61
|
-
*
|
|
62
|
-
*
|
|
63
|
-
*
|
|
64
|
-
*
|
|
65
|
-
*
|
|
66
|
-
*
|
|
67
|
-
*
|
|
68
|
-
* list so a confident parse is never clobbered. (`O` is always eligible.)
|
|
56
|
+
* Tags a unit span is allowed to overwrite on the ADD path. The v0.7.2 arena showed the dominant
|
|
57
|
+
* failure for bare designator-led units ("Flat 2 14 Smith St", "APT 2 …") is the model labeling the
|
|
58
|
+
* WHOLE designator+identifier run as `locality` — not leaving it `O`. An explicit designator +
|
|
59
|
+
* identifier is a high-confidence "this is a unit" shape (a real locality/suburb name never has
|
|
60
|
+
* that form), so — exactly like postcode-repair's ADD_OVER_TAGS — we let it reclaim a
|
|
61
|
+
* `locality`/`dependent_locality` span. Structural tags (house_number, street*, postcode, po_box,
|
|
62
|
+
* region, country, venue) stay off the list so a confident parse is never clobbered. (`O` is always
|
|
63
|
+
* eligible.)
|
|
69
64
|
*/
|
|
70
65
|
const ADD_OVER_TAGS = new Set(["locality", "dependent_locality"]);
|
|
71
66
|
function isUnitLabel(label) {
|
|
@@ -95,8 +90,8 @@ function collectMatches(text) {
|
|
|
95
90
|
return accepted;
|
|
96
91
|
}
|
|
97
92
|
/**
|
|
98
|
-
* Repair secondary-unit label spans in a decoded token sequence using designator
|
|
99
|
-
*
|
|
93
|
+
* Repair secondary-unit label spans in a decoded token sequence using designator regexes. Returns a
|
|
94
|
+
* NEW token array (inputs are not mutated) plus a change count.
|
|
100
95
|
*/
|
|
101
96
|
export function repairUnitLabels(text, input) {
|
|
102
97
|
const matches = collectMatches(text);
|
package/out/unit-repair.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"unit-repair.js","sourceRoot":"","sources":["../unit-repair.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"unit-repair.js","sourceRoot":"","sources":["../unit-repair.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAYH;;;;;GAKG;AACH,MAAM,gBAAgB,GACrB,qIAAqI,CAAA;AAEtI,MAAM,aAAa,GAAyC;IAC3D,kFAAkF;IAClF,wEAAwE;IACxE,iFAAiF;IACjF,kFAAkF;IAClF,8EAA8E;IAC9E,kCAAkC;IAClC;QACC,KAAK,EAAE,YAAY;QACnB,EAAE,EAAE,IAAI,MAAM,CACb,SAAS,gBAAgB,4EAA4E,EACrG,IAAI,CACJ;KACD;IACD,8EAA8E;IAC9E,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE,EAAE,yBAAyB,EAAE;CAChD,CAAA;AAED,MAAM,MAAM,GAAG,QAAiC,CAAA;AAChD,MAAM,MAAM,GAAG,QAAiC,CAAA;AAChD,MAAM,OAAO,GAAG,GAA4B,CAAA;AAE5C;;;;;;;;;GASG;AACH,MAAM,aAAa,GAAG,IAAI,GAAG,CAAS,CAAC,UAAU,EAAE,oBAAoB,CAAC,CAAC,CAAA;AAEzE,SAAS,WAAW,CAAC,KAAa;IACjC,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,QAAQ,CAAA;AAChD,CAAC;AAED,qFAAqF;AACrF,SAAS,KAAK,CAAC,KAAa;IAC3B,OAAO,KAAK,KAAK,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;AAC7C,CAAC;AAED,mGAAmG;AACnG,SAAS,cAAc,CAAC,IAAY;IACnC,MAAM,UAAU,GAAgB,EAAE,CAAA;IAClC,aAAa,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,QAAQ,EAAE,EAAE;QACvC,GAAG,CAAC,EAAE,CAAC,SAAS,GAAG,CAAC,CAAA;QACpB,KAAK,IAAI,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YAC1D,UAAU,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,CAAC,CAAA;QAC1E,CAAC;IACF,CAAC,CAAC,CAAA;IACF,yFAAyF;IACzF,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAA;IACzF,MAAM,QAAQ,GAAgB,EAAE,CAAA;IAChC,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;QAC5B,IAAI,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAC;YAAE,SAAQ;QACtE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;IACD,OAAO,QAAQ,CAAA;AAChB,CAAC;AAQD;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY,EAAE,KAA8B;IAC5E,MAAM,OAAO,GAAG,cAAc,CAAC,IAAI,CAAC,CAAA;IACpC,MAAM,
MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC,CAAA;IAC3C,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAC,EAAE,CAAA;IAEvD,IAAI,OAAO,GAAG,CAAC,CAAA;IACf,MAAM,QAAQ,GAAG,CAAC,CAAS,EAAE,KAA4B,EAAQ,EAAE;QAClE,IAAI,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,KAAK,KAAK,EAAE,CAAC;YAChC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,GAAG,KAAK,CAAA;YACxB,OAAO,EAAE,CAAA;QACV,CAAC;IACF,CAAC,CAAA;IAED,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACzB,+CAA+C;QAC/C,MAAM,OAAO,GAAa,EAAE,CAAA;QAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACpB,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG;gBAAE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACxD,CAAC;QACD,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;YAAE,SAAQ;QAElC,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,CAAC,CAAA;QAClE,IAAI,CAAC,OAAO,EAAE,CAAC;YACd,iFAAiF;YACjF,6EAA6E;YAC7E,oFAAoF;YACpF,+BAA+B;YAC/B,MAAM,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE;gBAChC,MAAM,GAAG,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,CAAA;gBACnC,OAAO,GAAG,KAAK,IAAI,IAAI,aAAa,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;YAC9C,CAAC,CAAC,CAAA;YACF,IAAI,CAAC,IAAI;gBAAE,SAAQ;QACpB,CAAC;QAED,2DAA2D;QAC3D,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAA;QAEjE,4EAA4E;QAC5E,KAAK,IAAI,CAAC,GAAG,OAAO,CAAC,CAAC,CAAE,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,WAAW,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,EAAE,CAAC,EAAE;YAAE,QAAQ,CAAC,CAAC,EAAE,OAAO,CAAC,CAAA;QAChG,KAAK,IAAI,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAE,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,IAAI,WAAW,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YACxG,QAAQ,CAAC,CAAC,EAAE,OAAO,CAAC,CAAA;QACrB,CAAC;IACF,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAA;AAC3B,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mailwoman/neural",
|
|
3
|
-
"version": "4.0.0",
|
|
3
|
+
"version": "4.1.0",
|
|
4
4
|
"description": "Mailwoman neural classifier runtime: SentencePiece tokenizer + ONNX inference + decoder wiring.",
|
|
5
5
|
"license": "AGPL-3.0-only",
|
|
6
6
|
"repository": {
|
|
@@ -20,8 +20,8 @@
|
|
|
20
20
|
"./browser": "./out/browser.js"
|
|
21
21
|
},
|
|
22
22
|
"dependencies": {
|
|
23
|
-
"@mailwoman/codex": "4.
|
|
24
|
-
"@mailwoman/core": "4.
|
|
23
|
+
"@mailwoman/codex": "4.1.0",
|
|
24
|
+
"@mailwoman/core": "4.1.0",
|
|
25
25
|
"@sctg/sentencepiece-js": "^1.3.3",
|
|
26
26
|
"onnxruntime-node": "^1.26.0"
|
|
27
27
|
},
|