@mailwoman/neural 4.1.0 → 4.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/address-system.d.ts +35 -0
- package/out/address-system.d.ts.map +1 -0
- package/out/address-system.js +55 -0
- package/out/address-system.js.map +1 -0
- package/out/classifier.d.ts +53 -1
- package/out/classifier.d.ts.map +1 -1
- package/out/classifier.js +64 -55
- package/out/classifier.js.map +1 -1
- package/out/gazetteer-inference.d.ts +72 -0
- package/out/gazetteer-inference.d.ts.map +1 -0
- package/out/gazetteer-inference.js +180 -0
- package/out/gazetteer-inference.js.map +1 -0
- package/out/index.d.ts +1 -0
- package/out/index.d.ts.map +1 -1
- package/out/index.js +1 -0
- package/out/index.js.map +1 -1
- package/out/onnx-runner.d.ts +9 -0
- package/out/onnx-runner.d.ts.map +1 -1
- package/out/onnx-runner.js +30 -2
- package/out/onnx-runner.js.map +1 -1
- package/package.json +3 -3
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Address-system detection from the model's locale head (#511 Tier A — the consumer the head
|
|
7
|
+
* never had). The PR3 self-conditioning head predicts which country an address belongs to from
|
|
8
|
+
* the pooled sequence; v1.1.0+ exports surface it as the `locale_logits` ONNX output. This module
|
|
9
|
+
* turns that posterior into a `SystemCode` the conventions layer can act on.
|
|
10
|
+
*
|
|
11
|
+
* Conservative by contract: below the confidence threshold, or for locales without a codex
|
|
12
|
+
* system slice, detection returns null and the parse proceeds exactly as before. The mask must
|
|
13
|
+
* never fire on a guess.
|
|
14
|
+
*/
|
|
15
|
+
import type { SystemCode } from "@mailwoman/codex";
|
|
16
|
+
/**
|
|
17
|
+
* Locale-head class order — MUST mirror `corpus-python/src/mailwoman_train/labels.py`
|
|
18
|
+
* `LOCALE_COUNTRIES` exactly (same never-reorder/append-only discipline; a drift here silently
|
|
19
|
+
* mislabels every detection).
|
|
20
|
+
*/
|
|
21
|
+
export declare const LOCALE_COUNTRIES: readonly ["US", "FR", "DE", "CA", "GB", "JP", "ES", "IT", "NL"];
|
|
22
|
+
/** A confident locale-head detection: the winning country, its codex system slice, and the probability behind it. */
export interface DetectedSystem {
|
|
23
|
+
/** Codex address-system slice mapped from `country`; countries without a slice never produce a detection. */
system: SystemCode;
|
|
24
|
+
/** The `LOCALE_COUNTRIES` entry with the highest softmax probability. */
country: (typeof LOCALE_COUNTRIES)[number];
|
|
25
|
+
/** Softmax probability assigned to `country`; it is compared against the detection threshold before a result is returned. */
confidence: number;
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Read the locale head's posterior into a confident `SystemCode`, or null.
|
|
29
|
+
*
|
|
30
|
+
* @param localeLogits The raw `locale_logits` output (LOCALE_COUNTRIES order).
|
|
31
|
+
* @param threshold Minimum softmax probability to act on (default 0.8 — the head's held-out
|
|
32
|
+
* accuracy is ~0.98, so 0.8 trades a little recall for never masking on a coin flip).
|
|
33
|
+
*/
|
|
34
|
+
export declare function detectAddressSystem(localeLogits: readonly number[] | undefined, threshold?: number): DetectedSystem | null;
|
|
35
|
+
//# sourceMappingURL=address-system.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"address-system.d.ts","sourceRoot":"","sources":["../address-system.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAA;AAIlD;;;;GAIG;AACH,eAAO,MAAM,gBAAgB,iEAAkE,CAAA;AAY/F,MAAM,WAAW,cAAc;IAC9B,MAAM,EAAE,UAAU,CAAA;IAClB,OAAO,EAAE,CAAC,OAAO,gBAAgB,CAAC,CAAC,MAAM,CAAC,CAAA;IAC1C,UAAU,EAAE,MAAM,CAAA;CAClB;AAED;;;;;;GAMG;AACH,wBAAgB,mBAAmB,CAClC,YAAY,EAAE,SAAS,MAAM,EAAE,GAAG,SAAS,EAC3C,SAAS,SAAM,GACb,cAAc,GAAG,IAAI,CAWvB"}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Address-system detection from the model's locale head (#511 Tier A — the consumer the head
|
|
7
|
+
* never had). The PR3 self-conditioning head predicts which country an address belongs to from
|
|
8
|
+
* the pooled sequence; v1.1.0+ exports surface it as the `locale_logits` ONNX output. This module
|
|
9
|
+
* turns that posterior into a `SystemCode` the conventions layer can act on.
|
|
10
|
+
*
|
|
11
|
+
* Conservative by contract: below the confidence threshold, or for locales without a codex
|
|
12
|
+
* system slice, detection returns null and the parse proceeds exactly as before. The mask must
|
|
13
|
+
* never fire on a guess.
|
|
14
|
+
*/
|
|
15
|
+
import { softmax } from "./viterbi.js";
|
|
16
|
+
/**
|
|
17
|
+
* Locale-head class order — MUST mirror `corpus-python/src/mailwoman_train/labels.py`
|
|
18
|
+
* `LOCALE_COUNTRIES` exactly (same never-reorder/append-only discipline; a drift here silently
|
|
19
|
+
* mislabels every detection).
|
|
20
|
+
*/
|
|
21
|
+
export const LOCALE_COUNTRIES = ["US", "FR", "DE", "CA", "GB", "JP", "ES", "IT", "NL"];
/**
 * ISO-2 country → codex address-system slice. Countries the locale head can predict but that are
 * absent here (ES, IT, NL) have no conventions yet, so detection returns null for them.
 */
const COUNTRY_TO_SYSTEM = {
    US: "us",
    FR: "fr",
    DE: "de",
    CA: "ca",
    GB: "gb",
    JP: "jp",
};
/**
 * Read the locale head's posterior into a confident `SystemCode`, or null.
 *
 * Returns null — leaving the parse unconstrained — when the logits are missing or are not exactly
 * one per `LOCALE_COUNTRIES` entry, when the winning probability does not reach `threshold`, when
 * either side of that comparison is NaN, or when the winning country has no system slice.
 *
 * @param localeLogits The raw `locale_logits` output (LOCALE_COUNTRIES order).
 * @param threshold Minimum softmax probability to act on (default 0.8 — the head's held-out
 *   accuracy is ~0.98, so 0.8 trades a little recall for never masking on a coin flip).
 * @returns `{ system, country, confidence }` for a confident, mapped detection; otherwise null.
 */
export function detectAddressSystem(localeLogits, threshold = 0.8) {
    if (!localeLogits || localeLogits.length !== LOCALE_COUNTRIES.length)
        return null;
    const probs = softmax(localeLogits);
    // Argmax over the posterior; on ties the earliest class wins.
    let best = 0;
    for (let i = 1; i < probs.length; i++) {
        if (probs[i] > probs[best])
            best = i;
    }
    const confidence = probs[best];
    // Deliberately a negated `>=`: `NaN < threshold` is false, so a plain `<` gate would let a NaN
    // confidence (or a NaN threshold) through and apply conventions on garbage. This form rejects
    // NaN on either side, keeping the "never fire on a guess" contract.
    if (!(confidence >= threshold))
        return null;
    const country = LOCALE_COUNTRIES[best];
    const system = COUNTRY_TO_SYSTEM[country];
    if (!system)
        return null;
    return { system, country, confidence };
}
|
|
55
|
+
//# sourceMappingURL=address-system.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"address-system.js","sourceRoot":"","sources":["../address-system.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAIH,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAA;AAEtC;;;;GAIG;AACH,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAU,CAAA;AAE/F,4FAA4F;AAC5F,MAAM,iBAAiB,GAAmE;IACzF,EAAE,EAAE,IAAI;IACR,EAAE,EAAE,IAAI;IACR,EAAE,EAAE,IAAI;IACR,EAAE,EAAE,IAAI;IACR,EAAE,EAAE,IAAI;IACR,EAAE,EAAE,IAAI;CACR,CAAA;AAQD;;;;;;GAMG;AACH,MAAM,UAAU,mBAAmB,CAClC,YAA2C,EAC3C,SAAS,GAAG,GAAG;IAEf,IAAI,CAAC,YAAY,IAAI,YAAY,CAAC,MAAM,KAAK,gBAAgB,CAAC,MAAM;QAAE,OAAO,IAAI,CAAA;IACjF,MAAM,KAAK,GAAG,OAAO,CAAC,YAAwB,CAAC,CAAA;IAC/C,IAAI,IAAI,GAAG,CAAC,CAAA;IACZ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE;QAAE,IAAI,KAAK,CAAC,CAAC,CAAE,GAAG,KAAK,CAAC,IAAI,CAAE;YAAE,IAAI,GAAG,CAAC,CAAA;IAC7E,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAE,CAAA;IAC/B,IAAI,UAAU,GAAG,SAAS;QAAE,OAAO,IAAI,CAAA;IACvC,MAAM,OAAO,GAAG,gBAAgB,CAAC,IAAI,CAAE,CAAA;IACvC,MAAM,MAAM,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAA;IACzC,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAA;IACxB,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,CAAA;AACvC,CAAC"}
|
package/out/classifier.d.ts
CHANGED
|
@@ -10,7 +10,9 @@
|
|
|
10
10
|
* Convenience wrappers `parseJson` / `parseTuples` / `parseXml` project the tree on the way out.
|
|
11
11
|
*/
|
|
12
12
|
import { decodeAsXml, type AddressTree, type Calibrator, type ComponentTag } from "@mailwoman/core/decoder";
|
|
13
|
+
import { type SystemCode } from "@mailwoman/codex";
|
|
13
14
|
import { type AnchorLookup } from "./anchor-inference.js";
|
|
15
|
+
import { type GazetteerLexicon } from "./gazetteer-inference.js";
|
|
14
16
|
import { type FstMatcherLike } from "./fst-prior.js";
|
|
15
17
|
import type { InferResult } from "./onnx-runner.js";
|
|
16
18
|
import { type QueryShapeLike } from "./query-shape-prior.js";
|
|
@@ -26,6 +28,9 @@ export interface NeuralRunner {
|
|
|
26
28
|
infer(tokenIds: number[], anchor?: {
|
|
27
29
|
features: ReadonlyArray<ReadonlyArray<number>>;
|
|
28
30
|
confidence: ReadonlyArray<number>;
|
|
31
|
+
}, gazetteer?: {
|
|
32
|
+
features: ReadonlyArray<ReadonlyArray<number>>;
|
|
33
|
+
confidence: ReadonlyArray<number>;
|
|
29
34
|
}): Promise<InferResult>;
|
|
30
35
|
}
|
|
31
36
|
export interface NeuralAddressClassifierConfig {
|
|
@@ -63,8 +68,35 @@ export interface NeuralAddressClassifierConfig {
|
|
|
63
68
|
* models. Load via `loadAnchorLookup` from `./anchor-inference.js`.
|
|
64
69
|
*/
|
|
65
70
|
postcodeAnchorLookup?: AnchorLookup;
|
|
71
|
+
/**
|
|
72
|
+
* Optional gazetteer-anchor lexicon (#464, knowledge-ladder rung 3.2). When set, `parse` builds
|
|
73
|
+
* per-token candidate-tag-set clues (country/region/po_box/cedex/homograph) from the text + this
|
|
74
|
+
* lexicon and feeds them to the runner — for models trained with the gazetteer-anchor channel
|
|
75
|
+
* (exported with the `gazetteer_features`/`gazetteer_confidence` ONNX inputs). Omit for plain
|
|
76
|
+
* models. Load via `parseGazetteerLexicon` from `./gazetteer-inference.js`.
|
|
77
|
+
*/
|
|
78
|
+
gazetteerLexicon?: GazetteerLexicon;
|
|
79
|
+
/**
|
|
80
|
+
* Channel choreography (#464, v0.9.13 postcode fix): when true, zero the gazetteer clue on pieces
|
|
81
|
+
* adjacent to a postcode-anchor hit (needs both `gazetteerLexicon` and `postcodeAnchorLookup`).
|
|
82
|
+
* Targets the region-clue→postcode CRF interference (~3pp US postcode).
|
|
83
|
+
*
|
|
84
|
+
* PAIRING IS LOAD-BEARING: set this IFF the model was TRAINED with the matching train-time
|
|
85
|
+
* choreography (`data.gazetteer_choreography`). The 2026-06-10 diagnostic showed the harm is
|
|
86
|
+
* WEIGHT-BAKED — applying this at inference on a model trained *without* train-choreography does
|
|
87
|
+
* NOT recover postcode and adds train/inference skew. Only enable for a consolidation-era model
|
|
88
|
+
* trained with the train-time half.
|
|
89
|
+
*/
|
|
90
|
+
suppressGazetteerNearPostcode?: boolean;
|
|
91
|
+
/**
|
|
92
|
+
* Default address-system conventions mode for every parse (see `ParseOpts.addressSystemConventions`
|
|
93
|
+
* for semantics — `"auto"` reads the model's locale head; a `SystemCode` pins it). Per-parse opts
|
|
94
|
+
* override this. Omit for the byte-stable pre-#511 default (no detection, no mask).
|
|
95
|
+
*/
|
|
96
|
+
addressSystemConventions?: "auto" | SystemCode;
|
|
66
97
|
}
|
|
67
98
|
export declare class NeuralAddressClassifier {
|
|
99
|
+
#private;
|
|
68
100
|
private readonly cfg;
|
|
69
101
|
private readonly labels;
|
|
70
102
|
private readonly decodeMode;
|
|
@@ -91,7 +123,11 @@ export declare class NeuralAddressClassifier {
|
|
|
91
123
|
parse(text: string, opts?: ParseOpts): Promise<AddressTree>;
|
|
92
124
|
/**
|
|
93
125
|
* Like `parse`, but also returns the raw per-token logits and piece offsets needed for per-span
|
|
94
|
-
* logit aggregation (Option C joint-reconcile integration).
|
|
126
|
+
* logit aggregation (Option C joint-reconcile integration). Shares the ENTIRE decode path with
|
|
127
|
+
* `parse` (one `#decode`, #481) — including the repair passes, which previously ran only in
|
|
128
|
+
* `parse`: reconcile must consume the same tokens the argmax path serves users, and the repair
|
|
129
|
+
* opts were silently ignored here before. `logits` stay RAW (pre-prior, pre-repair) — they are
|
|
130
|
+
* the model's emissions, not the decode's opinions.
|
|
95
131
|
*/
|
|
96
132
|
parseWithLogits(text: string, opts?: ParseOpts): Promise<ParseWithLogitsResult>;
|
|
97
133
|
parseJson(text: string, opts?: ParseOpts): Promise<Partial<Record<ComponentTag, string>>>;
|
|
@@ -177,5 +213,21 @@ export interface ParseOpts {
|
|
|
177
213
|
* (`@mailwoman/core/decoder`) from `data/eval/calibration/isotonic-<locale>-<version>.json`.
|
|
178
214
|
*/
|
|
179
215
|
calibrate?: Calibrator;
|
|
216
|
+
/**
|
|
217
|
+
* Address-system conventions enforcement (#511 Tier A / #478's rules-as-constraints slice).
|
|
218
|
+
*
|
|
219
|
+
* - `"auto"` — detect the system from the model's locale head (`locale_logits` output, v1.1.0+
|
|
220
|
+
* exports; silently no-ops on models without it) and apply that system's codex conventions:
|
|
221
|
+
* forbidden tags become a hard emission mask before Viterbi, and a conventions postcode shape
|
|
222
|
+
* enables the snap-only postcode repair pass.
|
|
223
|
+
* - A `SystemCode` (`"fr"`, `"us"`, …) — apply that system's conventions unconditionally
|
|
224
|
+
* (callers that already know the locale, e.g. the pipeline's BCP-47 region).
|
|
225
|
+
* - Omit — byte-stable default: no detection, no mask (pre-#511 behavior).
|
|
226
|
+
*
|
|
227
|
+
* The detection threshold is deliberately high (0.8): the mask must never fire on a guess.
|
|
228
|
+
* Measured motivation: the 2026-06-10 v1.1.0 gate, where US suffix logic fired inside French
|
|
229
|
+
* parses (`street_suffix: "Rue"`) and digit-splits corrupted leading FR postcodes.
|
|
230
|
+
*/
|
|
231
|
+
addressSystemConventions?: "auto" | SystemCode;
|
|
180
232
|
}
|
|
181
233
|
//# sourceMappingURL=classifier.d.ts.map
|
package/out/classifier.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAIN,WAAW,EACX,KAAK,WAAW,EAChB,KAAK,UAAU,EACf,KAAK,YAAY,EAEjB,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAAuB,KAAK,YAAY,EAAE,MAAM,uBAAuB,CAAA;AAC9E,OAAO,EAA0B,KAAK,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAE5E,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA;AAEnD,OAAO,EAA0C,KAAK,cAAc,EAAE,MAAM,wBAAwB,CAAA;AACpG,OAAO,EAAuC,KAAK,yBAAyB,EAAE,MAAM,8BAA8B,CAAA;AAClH,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AAGnD,OAAO,KAAK,EAAE,kBAAkB,EAAmB,MAAM,cAAc,CAAA;AAEvE;;;;GAIG;AACH,MAAM,WAAW,YAAY;IAC5B,KAAK,CACJ,QAAQ,EAAE,MAAM,EAAE,EAClB,MAAM,CAAC,EAAE;QAAE,QAAQ,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC;QAAC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,CAAA;KAAE,
|
|
1
|
+
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAIN,WAAW,EACX,KAAK,WAAW,EAChB,KAAK,UAAU,EACf,KAAK,YAAY,EAEjB,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAAwB,KAAK,UAAU,EAAE,MAAM,kBAAkB,CAAA;AAGxE,OAAO,EAAuB,KAAK,YAAY,EAAE,MAAM,uBAAuB,CAAA;AAC9E,OAAO,EAAyD,KAAK,gBAAgB,EAAE,MAAM,0BAA0B,CAAA;AACvH,OAAO,EAA0B,KAAK,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAE5E,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA;AAEnD,OAAO,EAA0C,KAAK,cAAc,EAAE,MAAM,wBAAwB,CAAA;AACpG,OAAO,EAAuC,KAAK,yBAAyB,EAAE,MAAM,8BAA8B,CAAA;AAClH,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AAGnD,OAAO,KAAK,EAAE,kBAAkB,EAAmB,MAAM,cAAc,CAAA;AAEvE;;;;GAIG;AACH,MAAM,WAAW,YAAY;IAC5B,KAAK,CACJ,QAAQ,EAAE,MAAM,EAAE,EAClB,MAAM,CAAC,EAAE;QAAE,QAAQ,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC;QAAC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,CAAA;KAAE,EAC9F,SAAS,CAAC,EAAE;QAAE,QAAQ,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC;QAAC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,CAAA;KAAE,GAC/F,OAAO,CAAC,WAAW,CAAC,CAAA;CACvB;AAED,MAAM,WAAW,6BAA6B;IAC7C,SAAS,EAAE,kBAAkB,CAAA;IAC7B,MAAM,EAAE,YAAY,CAAA;IACpB;;;;OAIG;IACH,MAAM,CAAC,EAAE,SAAS,MAAM,EAAE,CAAA;IAC1B;;;;;;;OAOG;IACH,MAAM,CAAC,EAAE,SAAS,GAAG,QAAQ,CAAA;IAC7B;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,EAAE,EAAE,CAAA;IACxB,sEAAsE;IACtE,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAA;IAC3B,oEAAoE;IACpE,cAAc,CAAC,EAAE,MAAM,EAAE,CAAA;IACzB;;;;;OAKG;IACH,oBAAoB,CAAC,EAAE,YAAY,CAAA;IACnC;;;;;;OAMG;IACH,gBAAgB,CAAC,EAAE,gBAAgB,CAAA;IACnC;;;;;;;;;;OAUG;IACH,6BAA6B,CAAC,EAAE,OAAO,CAAA;IACvC;;;;OAIG;IACH,wBAAwB,CAAC,EAAE,MAAM,GAAG,UAAU,CAAA;CAC9C;AAED,qBAAa,uBAAuB;;IAOvB,OAAO,CAAC,QAAQ,CAAC,GAAG;IANhC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmB;IAC1C,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAsB;IACjD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAY;IACxC,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAU;IAC3C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAU;gBAEZ,GAAG,EAAE,6BAA6B;IAa/D;;;;;;;;;;;OAWG;WACU,eAAe,CAC3B,IAAI,GAAE,kBAAkB,GAAG;QAAE,oBAAoB,CAAC,EAAE,YAAY,CAAA;KAAO,GACrE,OAAO,CAAC,uBAAuB,CAAC;IA4BnC,6DAA6D;IACvD,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EA
AE,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC;IAMjE;;;;;;;OAOG;IACG,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,qBAAqB,CAAC;IAmI/E,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAIzF,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAInF,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG;QAAE,GAAG,CAAC,EAAE,UAAU,CAAC,OAAO,WAAW,CAAC,CAAC,CAAC,CAAC,CAAA;KAAE,GAAG,OAAO,CAAC,MAAM,CAAC;IAI7G;;;;;;;;;;OAUG;IACH,OAAO,CAAC,mBAAmB;CAW3B;AAED,wFAAwF;AACxF,MAAM,WAAW,qBAAqB;IACrC,IAAI,EAAE,WAAW,CAAA;IACjB,MAAM,EAAE,MAAM,EAAE,EAAE,CAAA;IAClB,MAAM,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;CAC7C;AAED;;;GAGG;AACH,MAAM,WAAW,SAAS;IACzB;;;;OAIG;IACH,UAAU,CAAC,EAAE,cAAc,CAAA;IAC3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAA;IAC5B;;;OAGG;IACH,GAAG,CAAC,EAAE,cAAc,CAAA;IACpB,6DAA6D;IAC7D,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB;;;;;;OAMG;IACH,mBAAmB,CAAC,EAAE,cAAc,CAAA;IACpC,yDAAyD;IACzD,uBAAuB,CAAC,EAAE,yBAAyB,CAAA;IACnD;;;;;;OAMG;IACH,cAAc,CAAC,EAAE,OAAO,CAAA;IACxB;;;;;;OAMG;IACH,UAAU,CAAC,EAAE,OAAO,CAAA;IACpB;;;;;OAKG;IACH,SAAS,CAAC,EAAE,UAAU,CAAA;IACtB;;;;;;;;;;;;;;OAcG;IACH,wBAAwB,CAAC,EAAE,MAAM,GAAG,UAAU,CAAA;CAC9C"}
|
package/out/classifier.js
CHANGED
|
@@ -10,7 +10,10 @@
|
|
|
10
10
|
* Convenience wrappers `parseJson` / `parseTuples` / `parseXml` project the tree on the way out.
|
|
11
11
|
*/
|
|
12
12
|
import { buildAddressTree, decodeAsJson, decodeAsTuples, decodeAsXml, } from "@mailwoman/core/decoder";
|
|
13
|
+
import { conventionsForSystem } from "@mailwoman/codex";
|
|
14
|
+
import { detectAddressSystem } from "./address-system.js";
|
|
13
15
|
import { buildAnchorFeatures } from "./anchor-inference.js";
|
|
16
|
+
import { buildGazetteerFeatures, suppressGazetteerNearPostcode } from "./gazetteer-inference.js";
|
|
14
17
|
import { buildFstEmissionPriors } from "./fst-prior.js";
|
|
15
18
|
import { STAGE2_BIO_LABELS } from "./labels.js";
|
|
16
19
|
import { repairPostcodeLabels } from "./postcode-repair.js";
|
|
@@ -83,71 +86,55 @@ export class NeuralAddressClassifier {
|
|
|
83
86
|
async parse(text, opts) {
|
|
84
87
|
if (text.length === 0)
|
|
85
88
|
return { raw: text, roots: [] };
|
|
86
|
-
const {
|
|
87
|
-
// Postcode-anchor channel (#239/#240): build per-piece anchor features from the same lookup the
|
|
88
|
-
// model trained on, fed alongside the ids. No-op when no lookup is configured.
|
|
89
|
-
const anchor = this.cfg.postcodeAnchorLookup
|
|
90
|
-
? buildAnchorFeatures(text, pieces, this.cfg.postcodeAnchorLookup)
|
|
91
|
-
: undefined;
|
|
92
|
-
const { logits } = await this.cfg.runner.infer(ids, anchor);
|
|
93
|
-
this.assertEmissionWidth(logits);
|
|
94
|
-
let emissions = opts?.queryShape
|
|
95
|
-
? addEmissionMatrix(logits, buildEmissionPriors(opts.queryShape, pieces, this.labels, {
|
|
96
|
-
biasScale: opts.queryShapeBiasScale ?? 1.0,
|
|
97
|
-
inputText: text,
|
|
98
|
-
}))
|
|
99
|
-
: logits;
|
|
100
|
-
if (opts?.fst) {
|
|
101
|
-
emissions = addEmissionMatrix(emissions, buildFstEmissionPriors(opts.fst, pieces, this.labels, {
|
|
102
|
-
biasScale: opts.fstBiasScale ?? 1.0,
|
|
103
|
-
}));
|
|
104
|
-
}
|
|
105
|
-
if (opts?.fstStreetMorphology) {
|
|
106
|
-
emissions = addEmissionMatrix(emissions, buildStreetMorphologyEmissionPriors(opts.fstStreetMorphology, pieces, this.labels, opts.fstStreetMorphologyOpts ?? {}));
|
|
107
|
-
}
|
|
108
|
-
const labelIndices = this.decodeMode === "viterbi"
|
|
109
|
-
? viterbi({
|
|
110
|
-
emissions,
|
|
111
|
-
transitions: this.transitions,
|
|
112
|
-
startTransitions: this.startTransitions,
|
|
113
|
-
endTransitions: this.endTransitions,
|
|
114
|
-
}).path
|
|
115
|
-
: emissions.map((row) => argmaxSoftmax(row).idx);
|
|
116
|
-
let tokens = pieces.map((p, i) => {
|
|
117
|
-
const idx = labelIndices[i];
|
|
118
|
-
const probs = softmax(logits[i]);
|
|
119
|
-
return {
|
|
120
|
-
piece: p.piece,
|
|
121
|
-
start: p.start,
|
|
122
|
-
end: p.end,
|
|
123
|
-
label: (this.labels[idx] ?? "O"),
|
|
124
|
-
confidence: probs[idx],
|
|
125
|
-
};
|
|
126
|
-
});
|
|
127
|
-
if (opts?.postcodeRepair) {
|
|
128
|
-
tokens = repairPostcodeLabels(text, tokens).tokens;
|
|
129
|
-
}
|
|
130
|
-
if (opts?.unitRepair) {
|
|
131
|
-
tokens = repairUnitLabels(text, tokens).tokens;
|
|
132
|
-
}
|
|
89
|
+
const { tokens } = await this.#decode(text, opts);
|
|
133
90
|
return buildAddressTree(text, tokens, opts?.calibrate ? { calibrate: opts.calibrate } : undefined);
|
|
134
91
|
}
|
|
135
92
|
/**
|
|
136
93
|
* Like `parse`, but also returns the raw per-token logits and piece offsets needed for per-span
|
|
137
|
-
* logit aggregation (Option C joint-reconcile integration).
|
|
94
|
+
* logit aggregation (Option C joint-reconcile integration). Shares the ENTIRE decode path with
|
|
95
|
+
* `parse` (one `#decode`, #481) — including the repair passes, which previously ran only in
|
|
96
|
+
* `parse`: reconcile must consume the same tokens the argmax path serves users, and the repair
|
|
97
|
+
* opts were silently ignored here before. `logits` stay RAW (pre-prior, pre-repair) — they are
|
|
98
|
+
* the model's emissions, not the decode's opinions.
|
|
138
99
|
*/
|
|
139
100
|
async parseWithLogits(text, opts) {
|
|
140
101
|
if (text.length === 0) {
|
|
141
102
|
return { tree: { raw: text, roots: [] }, logits: [], pieces: [] };
|
|
142
103
|
}
|
|
104
|
+
const { tokens, logits, pieces } = await this.#decode(text, opts);
|
|
105
|
+
return {
|
|
106
|
+
tree: buildAddressTree(text, tokens, opts?.calibrate ? { calibrate: opts.calibrate } : undefined),
|
|
107
|
+
logits,
|
|
108
|
+
pieces: pieces.map((p) => ({ start: p.start, end: p.end })),
|
|
109
|
+
};
|
|
110
|
+
}
|
|
111
|
+
/**
|
|
112
|
+
* THE decode path (#481): tokenize → anchor/gazetteer features → infer → priors → CRF/argmax →
|
|
113
|
+
* tokens → repairs. Both `parse` and `parseWithLogits` consume this — never fork it; the 2026-06
|
|
114
|
+
* audit found three drift surfaces in the previous duplicated copies.
|
|
115
|
+
*/
|
|
116
|
+
async #decode(text, opts) {
|
|
143
117
|
const { pieces, ids } = this.cfg.tokenizer.encode(text);
|
|
144
118
|
// Postcode-anchor channel (#239/#240): build per-piece anchor features from the same lookup the
|
|
145
119
|
// model trained on, fed alongside the ids. No-op when no lookup is configured.
|
|
146
120
|
const anchor = this.cfg.postcodeAnchorLookup
|
|
147
121
|
? buildAnchorFeatures(text, pieces, this.cfg.postcodeAnchorLookup)
|
|
148
122
|
: undefined;
|
|
149
|
-
const
|
|
123
|
+
const gazetteer = this.cfg.gazetteerLexicon
|
|
124
|
+
? buildGazetteerFeatures(text, pieces, this.cfg.gazetteerLexicon)
|
|
125
|
+
: undefined;
|
|
126
|
+
const gazFed = gazetteer && anchor && this.cfg.suppressGazetteerNearPostcode
|
|
127
|
+
? suppressGazetteerNearPostcode(gazetteer, anchor.confidence)
|
|
128
|
+
: gazetteer;
|
|
129
|
+
const { logits, localeLogits } = await this.cfg.runner.infer(ids, anchor, gazFed);
|
|
150
130
|
this.assertEmissionWidth(logits);
|
|
131
|
+
// Address-system conventions (#511 Tier A): resolve which system's rules apply — caller-pinned
|
|
132
|
+
// system, or the model's own locale-head detection under a high confidence bar. Null = no
|
|
133
|
+
// constraints; the parse below is byte-identical to the pre-conventions path.
|
|
134
|
+
const conventionsOpt = opts?.addressSystemConventions ?? this.cfg.addressSystemConventions;
|
|
135
|
+
const conventions = conventionsOpt === undefined
|
|
136
|
+
? null
|
|
137
|
+
: conventionsForSystem(conventionsOpt === "auto" ? (detectAddressSystem(localeLogits)?.system ?? null) : conventionsOpt);
|
|
151
138
|
let emissions = opts?.queryShape
|
|
152
139
|
? addEmissionMatrix(logits, buildEmissionPriors(opts.queryShape, pieces, this.labels, {
|
|
153
140
|
biasScale: opts.queryShapeBiasScale ?? 1.0,
|
|
@@ -162,6 +149,23 @@ export class NeuralAddressClassifier {
|
|
|
162
149
|
if (opts?.fstStreetMorphology) {
|
|
163
150
|
emissions = addEmissionMatrix(emissions, buildStreetMorphologyEmissionPriors(opts.fstStreetMorphology, pieces, this.labels, opts.fstStreetMorphologyOpts ?? {}));
|
|
164
151
|
}
|
|
152
|
+
// Conventions emission mask: tags that are ungrammatical in the detected system are removed
|
|
153
|
+
// from the decoder's vocabulary outright (-1e9 ≈ log 0). Copy-on-mask — `emissions` may alias
|
|
154
|
+
// `logits`, which the per-token confidence below reads unmasked.
|
|
155
|
+
if (conventions?.forbiddenTags?.length) {
|
|
156
|
+
const forbidden = new Set();
|
|
157
|
+
for (const tag of conventions.forbiddenTags) {
|
|
158
|
+
const b = this.labels.indexOf(`B-${tag}`);
|
|
159
|
+
const i = this.labels.indexOf(`I-${tag}`);
|
|
160
|
+
if (b >= 0)
|
|
161
|
+
forbidden.add(b);
|
|
162
|
+
if (i >= 0)
|
|
163
|
+
forbidden.add(i);
|
|
164
|
+
}
|
|
165
|
+
if (forbidden.size > 0) {
|
|
166
|
+
emissions = emissions.map((row) => row.map((v, idx) => (forbidden.has(idx) ? -1e9 : v)));
|
|
167
|
+
}
|
|
168
|
+
}
|
|
165
169
|
const labelIndices = this.decodeMode === "viterbi"
|
|
166
170
|
? viterbi({
|
|
167
171
|
emissions,
|
|
@@ -170,7 +174,7 @@ export class NeuralAddressClassifier {
|
|
|
170
174
|
endTransitions: this.endTransitions,
|
|
171
175
|
}).path
|
|
172
176
|
: emissions.map((row) => argmaxSoftmax(row).idx);
|
|
173
|
-
|
|
177
|
+
let tokens = pieces.map((p, i) => {
|
|
174
178
|
const idx = labelIndices[i];
|
|
175
179
|
const probs = softmax(logits[i]);
|
|
176
180
|
return {
|
|
@@ -181,11 +185,16 @@ export class NeuralAddressClassifier {
|
|
|
181
185
|
confidence: probs[idx],
|
|
182
186
|
};
|
|
183
187
|
});
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
188
|
+
// Postcode repair runs when the caller asks for it OR the detected system declares a postcode
|
|
189
|
+
// shape (#511 Tier A): a span that is a sub-match of a shape-valid string is exactly the
|
|
190
|
+
// snap-only truncation class the pass exists for ("47110" decoded as "4711" + a digit-split).
|
|
191
|
+
if (opts?.postcodeRepair || conventions?.postcodePattern) {
|
|
192
|
+
tokens = repairPostcodeLabels(text, tokens).tokens;
|
|
193
|
+
}
|
|
194
|
+
if (opts?.unitRepair) {
|
|
195
|
+
tokens = repairUnitLabels(text, tokens).tokens;
|
|
196
|
+
}
|
|
197
|
+
return { tokens, logits, pieces };
|
|
189
198
|
}
|
|
190
199
|
async parseJson(text, opts) {
|
|
191
200
|
return decodeAsJson(await this.parse(text, opts));
|
package/out/classifier.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"classifier.js","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EACN,gBAAgB,EAChB,YAAY,EACZ,cAAc,EACd,WAAW,GAKX,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAAE,mBAAmB,EAAqB,MAAM,uBAAuB,CAAA;AAC9E,OAAO,EAAE,sBAAsB,EAAuB,MAAM,gBAAgB,CAAA;AAC5E,OAAO,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAA;AAE/C,OAAO,EAAE,oBAAoB,EAAE,MAAM,sBAAsB,CAAA;AAC3D,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAuB,MAAM,wBAAwB,CAAA;AACpG,OAAO,EAAE,mCAAmC,EAAkC,MAAM,8BAA8B,CAAA;AAClH,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAA;AACnD,OAAO,EAAE,eAAe,EAAE,iBAAiB,EAAE,sBAAsB,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAA;
|
|
1
|
+
{"version":3,"file":"classifier.js","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EACN,gBAAgB,EAChB,YAAY,EACZ,cAAc,EACd,WAAW,GAKX,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAAE,oBAAoB,EAAmB,MAAM,kBAAkB,CAAA;AAExE,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAA;AACzD,OAAO,EAAE,mBAAmB,EAAqB,MAAM,uBAAuB,CAAA;AAC9E,OAAO,EAAE,sBAAsB,EAAE,6BAA6B,EAAyB,MAAM,0BAA0B,CAAA;AACvH,OAAO,EAAE,sBAAsB,EAAuB,MAAM,gBAAgB,CAAA;AAC5E,OAAO,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAA;AAE/C,OAAO,EAAE,oBAAoB,EAAE,MAAM,sBAAsB,CAAA;AAC3D,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAuB,MAAM,wBAAwB,CAAA;AACpG,OAAO,EAAE,mCAAmC,EAAkC,MAAM,8BAA8B,CAAA;AAClH,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAA;AACnD,OAAO,EAAE,eAAe,EAAE,iBAAiB,EAAE,sBAAsB,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAA;AA+E3G,MAAM,OAAO,uBAAuB;IAON;IANZ,MAAM,CAAmB;IACzB,UAAU,CAAsB;IAChC,WAAW,CAAY;IACvB,gBAAgB,CAAU;IAC1B,cAAc,CAAU;IAEzC,YAA6B,GAAkC;QAAlC,QAAG,GAAH,GAAG,CAA+B;QAC9D,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,MAAM,IAAI,iBAAiB,CAAA;QAC7C,IAAI,CAAC,UAAU,GAAG,GAAG,CAAC,MAAM,IAAI,SAAS,CAAA;QACzC,MAAM,UAAU,GAAG,sBAAsB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACtD,IAAI,GAAG,CAAC,WAAW,EAAE,CAAC;YACrB,IAAI,CAAC,WAAW,GAAG,WAAW,CAAC,UAAU,EAAE,GAAG,CAAC,WAAW,CAAC,CAAA;QAC5D,CAAC;aAAM,CAAC;YACP,IAAI,CAAC,WAAW,GAAG,UAAU,CAAA;QAC9B,CAAC;QACD,IAAI,CAAC,gBAAgB,GAAG,GAAG,CAAC,gBAAgB,IAAI,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC9E,IAAI,CAAC,cAAc,GAAG,GAAG,CAAC,cAAc,IAAI,eAAe,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACzE,CAAC;IAED;;;;;;;;;;;OAWG;IACH,MAAM,CAAC,KAAK,CAAC,eAAe,CAC3B,OAAqE,EAAE;QAEvE,yFAAyF;QACzF,2FAA2F;QAC3F,uFAAuF;QACvF,0FAA0F;QAC1F,2BAA2B;QAC3B,MAAM,CAAC,EAAE,UAAU,EAAE,EAAE,EAAE,cAAc,EAAE,uBAAuB,EAAE,kBAAkB,EAAE,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC3G,MAAM,CAAC,yBAAyB,CAAC,kBAAkB,CAAC;YACpD,MAAM,CAAC,yBAAyB,CAAC,cAAc,CAAC;SAChD,CAAC,CAAA;QACF,MAAM,QAAQ,GAAoB,cAAc,CAAC,IAAI,CAAC,CAAA;QACtD,MAAM,MAAM,GAAG,uBAAuB,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAA;QAC9D,MAAM,GAAG,GAAG,kBAAkB,CAAC,QAAQ,CAAC,kBAAkB,CAAC,CAAA;QAC3D,MAAM,
CAAC,SAAS,EAAE,MAAM,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC7C,kBAAkB,CAAC,YAAY,CAAC,QAAQ,CAAC,aAAa,CAAC;YACvD,UAAU,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC;SACrC,CAAC,CAAA;QACF,OAAO,IAAI,uBAAuB,CAAC;YAClC,SAAS;YACT,MAAM;YACN,MAAM;YACN,WAAW,EAAE,GAAG,EAAE,WAAW;YAC7B,gBAAgB,EAAE,GAAG,EAAE,gBAAgB;YACvC,cAAc,EAAE,GAAG,EAAE,cAAc;YACnC,GAAG,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC,CAAC,EAAE,oBAAoB,EAAE,IAAI,CAAC,oBAAoB,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACzF,CAAC,CAAA;IACH,CAAC;IAED,6DAA6D;IAC7D,KAAK,CAAC,KAAK,CAAC,IAAY,EAAE,IAAgB;QACzC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;QACtD,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,IAAI,CAAC,CAAA;QACjD,OAAO,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAA;IACnG,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,eAAe,CAAC,IAAY,EAAE,IAAgB;QACnD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,OAAO,EAAE,IAAI,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,CAAA;QAClE,CAAC;QACD,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,IAAI,CAAC,CAAA;QACjE,OAAO;YACN,IAAI,EAAE,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;YACjG,MAAM;YACN,MAAM,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC;SAC3D,CAAA;IACF,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,OAAO,CACZ,IAAY,EACZ,IAAgB;QAEhB,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;QACvD,gGAAgG;QAChG,+EAA+E;QAC/E,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,oBAAoB;YAC3C,CAAC,CAAC,mBAAmB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,oBAAoB,CAAC;YAClE,CAAC,CAAC,SAAS,CAAA;QACZ,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,gBAAgB;YAC1C,CAAC,CAAC,sBAAsB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC;YACjE,CAAC,CAAC,SAAS,CAAA;QACZ,MAAM,MAAM,GACX,SAAS,
IAAI,MAAM,IAAI,IAAI,CAAC,GAAG,CAAC,6BAA6B;YAC5D,CAAC,CAAC,6BAA6B,CAAC,SAAS,EAAE,MAAM,CAAC,UAAU,CAAC;YAC7D,CAAC,CAAC,SAAS,CAAA;QACb,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,CAAC,CAAA;QAEjF,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,CAAA;QAEhC,+FAA+F;QAC/F,0FAA0F;QAC1F,8EAA8E;QAC9E,MAAM,cAAc,GAAG,IAAI,EAAE,wBAAwB,IAAI,IAAI,CAAC,GAAG,CAAC,wBAAwB,CAAA;QAC1F,MAAM,WAAW,GAChB,cAAc,KAAK,SAAS;YAC3B,CAAC,CAAC,IAAI;YACN,CAAC,CAAC,oBAAoB,CACpB,cAAc,KAAK,MAAM,CAAC,CAAC,CAAC,CAAC,mBAAmB,CAAC,YAAY,CAAC,EAAE,MAAM,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,cAAc,CAChG,CAAA;QAEJ,IAAI,SAAS,GAAG,IAAI,EAAE,UAAU;YAC/B,CAAC,CAAC,iBAAiB,CACjB,MAAM,EACN,mBAAmB,CAAC,IAAI,CAAC,UAAU,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACzD,SAAS,EAAE,IAAI,CAAC,mBAAmB,IAAI,GAAG;gBAC1C,SAAS,EAAE,IAAI;aACf,CAAC,CACF;YACF,CAAC,CAAC,MAAM,CAAA;QAET,IAAI,IAAI,EAAE,GAAG,EAAE,CAAC;YACf,SAAS,GAAG,iBAAiB,CAC5B,SAAS,EACT,sBAAsB,CAAC,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACrD,SAAS,EAAE,IAAI,CAAC,YAAY,IAAI,GAAG;aACnC,CAAC,CACF,CAAA;QACF,CAAC;QAED,IAAI,IAAI,EAAE,mBAAmB,EAAE,CAAC;YAC/B,SAAS,GAAG,iBAAiB,CAC5B,SAAS,EACT,mCAAmC,CAClC,IAAI,CAAC,mBAAmB,EACxB,MAAM,EACN,IAAI,CAAC,MAAM,EACX,IAAI,CAAC,uBAAuB,IAAI,EAAE,CAClC,CACD,CAAA;QACF,CAAC;QAED,4FAA4F;QAC5F,8FAA8F;QAC9F,iEAAiE;QACjE,IAAI,WAAW,EAAE,aAAa,EAAE,MAAM,EAAE,CAAC;YACxC,MAAM,SAAS,GAAG,IAAI,GAAG,EAAU,CAAA;YACnC,KAAK,MAAM,GAAG,IAAI,WAAW,CAAC,aAAa,EAAE,CAAC;gBAC7C,MAAM,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,GAAG,EAAE,CAAC,CAAA;gBACzC,MAAM,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,GAAG,EAAE,CAAC,CAAA;gBACzC,IAAI,CAAC,IAAI,CAAC;oBAAE,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;gBAC5B,IAAI,CAAC,IAAI,CAAC;oBAAE,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;YAC7B,CAAC;YACD,IAAI,SAAS,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;gBACxB,SAAS,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;YACzF,CAAC;QAC
F,CAAC;QAED,MAAM,YAAY,GACjB,IAAI,CAAC,UAAU,KAAK,SAAS;YAC5B,CAAC,CAAC,OAAO,CAAC;gBACR,SAAS;gBACT,WAAW,EAAE,IAAI,CAAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;gBACvC,cAAc,EAAE,IAAI,CAAC,cAAc;aACnC,CAAC,CAAC,IAAI;YACR,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAA;QAElD,IAAI,MAAM,GAAmB,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAChD,MAAM,GAAG,GAAG,YAAY,CAAC,CAAC,CAAE,CAAA;YAC5B,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,CAAA;YACjC,OAAO;gBACN,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,GAAG,EAAE,CAAC,CAAC,GAAG;gBACV,KAAK,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,GAAG,CAA0B;gBACzD,UAAU,EAAE,KAAK,CAAC,GAAG,CAAE;aACvB,CAAA;QACF,CAAC,CAAC,CAAA;QAEF,8FAA8F;QAC9F,yFAAyF;QACzF,8FAA8F;QAC9F,IAAI,IAAI,EAAE,cAAc,IAAI,WAAW,EAAE,eAAe,EAAE,CAAC;YAC1D,MAAM,GAAG,oBAAoB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC,MAAM,CAAA;QACnD,CAAC;QACD,IAAI,IAAI,EAAE,UAAU,EAAE,CAAC;YACtB,MAAM,GAAG,gBAAgB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC,MAAM,CAAA;QAC/C,CAAC;QAED,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,CAAA;IAClC,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,IAAY,EAAE,IAAgB;QAC7C,OAAO,YAAY,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;IAClD,CAAC;IAED,KAAK,CAAC,WAAW,CAAC,IAAY,EAAE,IAAgB;QAC/C,OAAO,cAAc,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;IACpD,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,IAAY,EAAE,IAA8D;QAC1F,OAAO,WAAW,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,EAAE,IAAI,EAAE,GAAG,CAAC,CAAA;IAC5D,CAAC;IAED;;;;;;;;;;OAUG;IACK,mBAAmB,CAAC,MAA2B;QACtD,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;YAAE,OAAM;QAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAE,CAAC,MAAM,CAAA;QAC/B,IAAI,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YAChC,MAAM,IAAI,KAAK,CACd,wCAAwC,KAAK,2CAA2C;gBACvF,wBAAwB,IAAI,CAAC,MAAM,CAAC,MAAM,iDAAiD;gBAC3F,oFAAoF,CACrF,CAAA;QACF,CAAC;IACF,CAAC;CACD;AAmFD,SAAS,aAAa,CAAC,GAAa;IACnC,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,IAAI,MAAM,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;IACpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;Q
ACrC,IAAI,GAAG,CAAC,CAAC,CAAE,GAAG,MAAM,EAAE,CAAC;YACtB,MAAM,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;YAChB,MAAM,GAAG,CAAC,CAAA;QACX,CAAC;IACF,CAAC;IACD,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,KAAK,MAAM,CAAC,IAAI,GAAG;QAAE,MAAM,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,CAAA;IACnD,MAAM,IAAI,GAAG,CAAC,GAAG,MAAM,CAAA;IACvB,OAAO,EAAE,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,CAAA;AAC7B,CAAC;AAED,uGAAuG;AACvG,SAAS,WAAW,CAAC,CAAa,EAAE,CAAa;IAChD,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAA;IAClB,MAAM,GAAG,GAAe,EAAE,CAAA;IAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5B,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,CAAA;QAChC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAA;QAC1D,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACd,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC"}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Inference-side gazetteer-anchor features (#464, knowledge-ladder rung 3.2) — the TS mirror of the
|
|
7
|
+
* Python training pipeline (`mailwoman_train/gazetteer_anchor.py`). Both consumers load the SAME
|
|
8
|
+
* codex-generated lexicon (`scripts/build-gazetteer-anchor-lexicon.mjs` →
|
|
9
|
+
* `data/gazetteer/anchor-lexicon-v1.json`) whose `rules` encode the match semantics as DATA, so the
|
|
10
|
+
* two implementations cannot drift. The model conditions on per-token candidate-tag-set clues fed
|
|
11
|
+
* alongside `input_ids`; this builds them from a raw address + its SentencePiece pieces.
|
|
12
|
+
*
|
|
13
|
+
* The clue INFORMS, the model decides (model-first). `gazetteer-inference.test.ts` pins the matcher
|
|
14
|
+
* against the Python fixture: the homograph clue is symmetric, "in" ≠ "IN", multi-word countries
|
|
15
|
+
* paint every word.
|
|
16
|
+
*/
|
|
17
|
+
import type { TokenizedPiece } from "./tokenizer.js";
|
|
18
|
+
/**
|
|
19
|
+
* The candidate-tag-set feature width: country/region/po_box/cedex/homograph (the lexicon's slot
|
|
20
|
+
* count). Used for the ONNX zero-fallback when a gazetteer-trained model is run with no clue data.
|
|
21
|
+
* MUST match the lexicon JSON's `feature_dim` and the trained model's `gazetteer_feature_dim`.
|
|
22
|
+
*/
|
|
23
|
+
export declare const GAZETTEER_FEATURE_DIM = 5;
|
|
24
|
+
/**
 * The loaded lexicon — the JSON shape from build-gazetteer-anchor-lexicon.mjs, as returned by
 * `parseGazetteerLexicon` (snake_case fields renamed to camelCase, the two entry tables turned
 * into Maps).
 */
|
|
25
|
+
export interface GazetteerLexicon {
|
|
26
|
+
/** Width of the all-zero feature row emitted for a piece on which no clue fires. */
featureDim: number;
|
|
27
|
+
/** Ordered slot names; a feature row carries one 0/1 column per slot, in this order. */
slots: readonly string[];
|
|
28
|
+
/** Slot name → bit mask; a slot's column is 1 when its mask intersects the painted bitmask. */
bits: Record<string, number>;
|
|
29
|
+
/** Largest number of consecutive words joined (by a single space) when probing `entries`. */
maxNgram: number;
|
|
30
|
+
/** case-insensitive: key = word_norm lowercased → bitmask. */
|
|
31
|
+
entries: Map<string, number>;
|
|
32
|
+
/**
 * case-SENSITIVE: key = word_norm uppercased → bitmask (surface must already be uppercase).
 * Consulted for single-word matches only.
 */
|
|
33
|
+
codeEntries: Map<string, number>;
|
|
34
|
+
}
|
|
35
|
+
/**
 * Parse the lexicon JSON (already `JSON.parse`d — keeps this module browser-safe; caller reads).
 *
 * @param raw - The snake_case lexicon object. `entries` and `code_entries` become the `entries` /
 *   `codeEntries` Maps of the result; the remaining fields are carried over under camelCase names.
 * @returns The lexicon in the shape the matcher functions consume.
 * @throws Error when `feature_dim` is not a positive number, `slots` is not a non-empty array,
 *   `max_ngram` is not a number >= 1, or `bits` / `entries` / `code_entries` is not a non-null
 *   object.
 */
|
|
36
|
+
export declare function parseGazetteerLexicon(raw: {
|
|
37
|
+
feature_dim: number;
|
|
38
|
+
slots: string[];
|
|
39
|
+
bits: Record<string, number>;
|
|
40
|
+
max_ngram: number;
|
|
41
|
+
entries: Record<string, number>;
|
|
42
|
+
code_entries: Record<string, number>;
|
|
43
|
+
}): GazetteerLexicon;
|
|
44
|
+
/**
 * Scan the raw surface and paint each char with its candidate-tag bitmask (mirrors Python).
 *
 * Words are whitespace-delimited runs with leading/trailing non-letter/digit characters trimmed.
 * Matching is greedy: at each word the longest n-gram (up to `lexicon.maxNgram` words) is tried
 * first, and a hit paints the whole matched span, including the whitespace between its words.
 *
 * @param text - The raw address surface.
 * @param lexicon - The parsed lexicon to match against.
 * @returns One bitmask per index of `text` (length `text.length`); 0 where nothing matched.
 */
|
|
45
|
+
export declare function gazetteerCharPaint(text: string, lexicon: GazetteerLexicon): number[];
|
|
46
|
+
/**
|
|
47
|
+
* Channel choreography (#464, v0.9.13 postcode fix; DeepSeek 2026-06-10): zero the gazetteer clue on
|
|
48
|
+
* pieces within `window` of a postcode-anchor hit. The clue fires on the region token (`CA`/`GA`)
|
|
49
|
+
* immediately before a US postcode; its additive vector strengthens `B-region`, which makes the
|
|
50
|
+
* `B-region → B-postcode` CRF transition less competitive and drops the postcode (~3pp, US-only — FR
|
|
51
|
+
* postcode precedes the locality, no region neighbor). Suppressing the clue adjacent to the postcode
|
|
52
|
+
* removes the interference while leaving every other clue intact. Returns a NEW features/confidence
|
|
53
|
+
* pair (does not mutate). `anchorConfidence[i] > 0` marks postcode-span pieces. PAIRS WITH the
|
|
54
|
+
* train-time half (`gazetteer_anchor.suppress_gazetteer_near_postcode`) — enable both or neither.
|
|
55
|
+
*/
|
|
56
|
+
export declare function suppressGazetteerNearPostcode(gazetteer: {
|
|
57
|
+
features: number[][];
|
|
58
|
+
confidence: number[];
|
|
59
|
+
}, anchorConfidence: ReadonlyArray<number>, window?: number): {
|
|
60
|
+
features: number[][];
|
|
61
|
+
confidence: number[];
|
|
62
|
+
};
|
|
63
|
+
/**
|
|
64
|
+
* Per-piece gazetteer features + confidence for `text`, projected onto its SP `pieces` by the SAME
|
|
65
|
+
* char→piece rule the labels use (a piece takes the bits of the first non-whitespace char it covers).
|
|
66
|
+
* Returns `(pieces × featureDim)` features + `(pieces,)` confidence (1.0 wherever any bit fires).
|
|
67
|
+
*/
|
|
68
|
+
export declare function buildGazetteerFeatures(text: string, pieces: ReadonlyArray<TokenizedPiece>, lexicon: GazetteerLexicon): {
|
|
69
|
+
features: number[][];
|
|
70
|
+
confidence: number[];
|
|
71
|
+
};
|
|
72
|
+
//# sourceMappingURL=gazetteer-inference.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"gazetteer-inference.d.ts","sourceRoot":"","sources":["../gazetteer-inference.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAEpD;;;;GAIG;AACH,eAAO,MAAM,qBAAqB,IAAI,CAAA;AAEtC,mFAAmF;AACnF,MAAM,WAAW,gBAAgB;IAChC,UAAU,EAAE,MAAM,CAAA;IAClB,KAAK,EAAE,SAAS,MAAM,EAAE,CAAA;IACxB,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IAC5B,QAAQ,EAAE,MAAM,CAAA;IAChB,8DAA8D;IAC9D,OAAO,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IAC5B,gGAAgG;IAChG,WAAW,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;CAChC;AAED,qGAAqG;AACrG,wBAAgB,qBAAqB,CAAC,GAAG,EAAE;IAC1C,WAAW,EAAE,MAAM,CAAA;IACnB,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IAC5B,SAAS,EAAE,MAAM,CAAA;IACjB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IAC/B,YAAY,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;CACpC,GAAG,gBAAgB,CA0BnB;AAsBD,gGAAgG;AAChG,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,MAAM,EAAE,CA0DpF;AAED;;;;;;;;;GASG;AACH,wBAAgB,6BAA6B,CAC5C,SAAS,EAAE;IAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,CAAC;IAAC,UAAU,EAAE,MAAM,EAAE,CAAA;CAAE,EACzD,gBAAgB,EAAE,aAAa,CAAC,MAAM,CAAC,EACvC,MAAM,SAAI,GACR;IAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,CAAC;IAAC,UAAU,EAAE,MAAM,EAAE,CAAA;CAAE,CAgBhD;AAED;;;;GAIG;AACH,wBAAgB,sBAAsB,CACrC,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,aAAa,CAAC,cAAc,CAAC,EACrC,OAAO,EAAE,gBAAgB,GACvB;IAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,CAAC;IAAC,UAAU,EAAE,MAAM,EAAE,CAAA;CAAE,CAiBhD"}
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Inference-side gazetteer-anchor features (#464, knowledge-ladder rung 3.2) — the TS mirror of the
|
|
7
|
+
* Python training pipeline (`mailwoman_train/gazetteer_anchor.py`). Both consumers load the SAME
|
|
8
|
+
* codex-generated lexicon (`scripts/build-gazetteer-anchor-lexicon.mjs` →
|
|
9
|
+
* `data/gazetteer/anchor-lexicon-v1.json`) whose `rules` encode the match semantics as DATA, so the
|
|
10
|
+
* two implementations cannot drift. The model conditions on per-token candidate-tag-set clues fed
|
|
11
|
+
* alongside `input_ids`; this builds them from a raw address + its SentencePiece pieces.
|
|
12
|
+
*
|
|
13
|
+
* The clue INFORMS, the model decides (model-first). `gazetteer-inference.test.ts` pins the matcher
|
|
14
|
+
* against the Python fixture: the homograph clue is symmetric, "in" ≠ "IN", multi-word countries
|
|
15
|
+
* paint every word.
|
|
16
|
+
*/
|
|
17
|
+
/**
|
|
18
|
+
* The candidate-tag-set feature width: country/region/po_box/cedex/homograph (the lexicon's slot
|
|
19
|
+
* count). Used for the ONNX zero-fallback when a gazetteer-trained model is run with no clue data.
|
|
20
|
+
* MUST match the lexicon JSON's `feature_dim` and the trained model's `gazetteer_feature_dim`.
|
|
21
|
+
*/
|
|
22
|
+
export const GAZETTEER_FEATURE_DIM = 5;
|
|
23
|
+
/**
 * Parse the lexicon JSON (already `JSON.parse`d — keeps this module browser-safe; caller reads).
 *
 * Validation is deliberately loud (#481): a malformed lexicon must fail here, at load time, rather
 * than crash deep inside buildGazetteerFeatures or — worse — silently zero-fill clues.
 *
 * @param raw - The snake_case lexicon object produced by the lexicon build script.
 * @returns The lexicon with camelCase fields and `entries` / `code_entries` converted to Maps.
 * @throws Error when `feature_dim` or `max_ngram` is not a positive integer, `slots` is not a
 *   non-empty array of `feature_dim` names, `bits` / `entries` / `code_entries` is not a plain
 *   object, or a slot has no non-zero integer mask in `bits`.
 */
export function parseGazetteerLexicon(raw) {
    if (typeof raw?.feature_dim !== "number" || raw.feature_dim <= 0) {
        throw new Error(`gazetteer lexicon: feature_dim must be a positive number, got ${raw?.feature_dim}`);
    }
    // NaN and fractions slip past the comparison above but blow up later in `new Array(featureDim)`.
    if (!Number.isInteger(raw.feature_dim)) {
        throw new Error(`gazetteer lexicon: feature_dim must be an integer, got ${raw.feature_dim}`);
    }
    if (!Array.isArray(raw.slots) || raw.slots.length === 0) {
        throw new Error("gazetteer lexicon: slots must be a non-empty array");
    }
    // Matched rows are slots.length wide, zero rows are feature_dim wide: they must agree.
    if (raw.slots.length !== raw.feature_dim) {
        throw new Error(`gazetteer lexicon: slots must have feature_dim (${raw.feature_dim}) entries, got ${raw.slots.length}`);
    }
    if (typeof raw.max_ngram !== "number" || raw.max_ngram < 1) {
        throw new Error(`gazetteer lexicon: max_ngram must be >= 1, got ${raw.max_ngram}`);
    }
    // A NaN max_ngram would make the n-gram loop never run: every clue silently zero.
    if (!Number.isInteger(raw.max_ngram)) {
        throw new Error(`gazetteer lexicon: max_ngram must be an integer, got ${raw.max_ngram}`);
    }
    for (const field of ["bits", "entries", "code_entries"]) {
        const value = raw[field];
        // Arrays are `typeof "object"` too, but are never a valid key → bitmask table.
        if (typeof value !== "object" || value === null || Array.isArray(value)) {
            throw new Error(`gazetteer lexicon: ${field} must be an object`);
        }
    }
    for (const slot of raw.slots) {
        const mask = Object.prototype.hasOwnProperty.call(raw.bits, slot) ? raw.bits[slot] : undefined;
        // A slot without a usable mask can never fire; that is a broken reference, not a quiet zero.
        if (!Number.isInteger(mask) || mask === 0) {
            throw new Error(`gazetteer lexicon: slot "${slot}" has no non-zero integer mask in bits`);
        }
    }
    return {
        featureDim: raw.feature_dim,
        slots: raw.slots,
        bits: raw.bits,
        maxNgram: raw.max_ngram,
        entries: new Map(Object.entries(raw.entries)),
        codeEntries: new Map(Object.entries(raw.code_entries)),
    };
}
|
|
51
|
+
/**
 * Matches a single letter or digit. The `u` flag makes the scan code-point aware, so a letter
 * encoded as a surrogate pair is seen as one letter rather than two unmatched halves. Shared and
 * stateful (`g`): only used through `alnumBounds`, which resets and fully drains it.
 */
const ALNUM_CODE_POINT = /[\p{L}\p{N}]/gu;

/**
 * Locate the span of `word` left after trimming leading/trailing non-letter/digit characters.
 *
 * @param word - One whitespace-delimited surface word.
 * @returns `[start, end)` as string indices into `word`, or `null` when the word contains no
 *   letter or digit at all.
 */
function alnumBounds(word) {
    ALNUM_CODE_POINT.lastIndex = 0;
    let start = -1;
    let end = 0;
    // Draining to `null` leaves lastIndex back at 0, so the shared regex carries no state out.
    for (let hit = ALNUM_CODE_POINT.exec(word); hit !== null; hit = ALNUM_CODE_POINT.exec(word)) {
        if (start < 0)
            start = hit.index;
        end = hit.index + hit[0].length;
    }
    return start < 0 ? null : [start, end];
}

/** word_norm for one word: strip leading/trailing non-letter/digit chars (keep internal). */
function stripWord(word) {
    const bounds = alnumBounds(word);
    return bounds ? word.slice(bounds[0], bounds[1]) : "";
}

/**
 * Expand a candidate-tag bitmask into a 0/1 row with one column per lexicon slot, in slot order.
 *
 * @param bits - The bitmask painted on a character.
 * @param lexicon - Supplies the slot order (`slots`) and each slot's mask (`bits`).
 * @returns An array of `lexicon.slots.length` zeros and ones.
 */
function bitsToRow(bits, lexicon) {
    return lexicon.slots.map((slot) => (bits & lexicon.bits[slot] ? 1 : 0));
}

/**
 * Scan the raw surface and paint each char with its candidate-tag bitmask (mirrors Python).
 *
 * Words are whitespace-delimited runs, trimmed to their first..last letter/digit. At each word the
 * longest n-gram (up to `lexicon.maxNgram` words, joined by single spaces and lowercased) is tried
 * first against `lexicon.entries`; single words are additionally looked up verbatim in
 * `lexicon.codeEntries`. A hit paints the whole matched span and the scan resumes after it.
 *
 * @param text - The raw address surface.
 * @param lexicon - The parsed lexicon to match against.
 * @returns One bitmask per index of `text` (length `text.length`); 0 where nothing matched.
 */
export function gazetteerCharPaint(text, lexicon) {
    const charBits = new Array(text.length).fill(0);
    const words = [];
    const wordRe = /\S+/g;
    for (let m = wordRe.exec(text); m !== null; m = wordRe.exec(text)) {
        const surface = m[0];
        const bounds = alnumBounds(surface);
        // A punctuation-only word stays in the list as an empty placeholder: it must still break
        // up n-grams that would otherwise span across it.
        words.push(bounds
            ? { begin: m.index + bounds[0], end: m.index + bounds[1], text: surface.slice(bounds[0], bounds[1]) }
            : { begin: m.index, end: m.index, text: "" });
    }
    let i = 0;
    while (i < words.length) {
        if (!words[i].text) {
            i++;
            continue;
        }
        let matchedN = 0;
        let matchedBits = 0;
        const maxN = Math.min(lexicon.maxNgram, words.length - i);
        // Longest match wins: try the widest window first and stop at the first hit.
        for (let n = maxN; n >= 1; n--) {
            const parts = [];
            let ok = true;
            for (let k = i; k < i + n; k++) {
                if (!words[k].text) {
                    ok = false;
                    break;
                }
                parts.push(words[k].text);
            }
            if (!ok)
                continue;
            let bits = lexicon.entries.get(parts.join(" ").toLowerCase()) ?? 0;
            // code_entries is case-SENSITIVE: the surface must already BE uppercase ("IN" ≠ "in").
            if (n === 1)
                bits |= lexicon.codeEntries.get(parts[0]) ?? 0;
            if (bits) {
                matchedN = n;
                matchedBits = bits;
                break;
            }
        }
        if (matchedN) {
            // Paint first word's start through last word's end; `fill` clamps to the array length.
            charBits.fill(matchedBits, words[i].begin, words[i + matchedN - 1].end);
            i += matchedN;
        }
        else {
            i++;
        }
    }
    return charBits;
}
|
|
129
|
+
/**
 * Channel choreography (#464, v0.9.13 postcode fix): silence the gazetteer clue on every piece
 * that sits within `window` positions of a postcode-anchor hit, leaving all other clues intact.
 *
 * Motivation: the clue fires on the region token (`CA`/`GA`) right before a US postcode, and its
 * additive vector boosts `B-region` enough to make the `B-region → B-postcode` CRF transition
 * less competitive, which drops the postcode (~3pp, US-only; a FR postcode precedes the locality
 * and has no region neighbor).
 *
 * PAIRS WITH the train-time half (`gazetteer_anchor.suppress_gazetteer_near_postcode`) — enable
 * both or neither.
 *
 * @param gazetteer - Per-piece clue rows and confidences; not mutated.
 * @param anchorConfidence - Postcode-anchor confidence per piece; `> 0` marks a postcode piece.
 * @param window - How many neighbors on each side of a postcode piece to silence (default 1).
 * @returns A new features/confidence pair: silenced pieces get an all-zero row and confidence 0,
 *   untouched pieces keep their original row and confidence.
 */
export function suppressGazetteerNearPostcode(gazetteer, anchorConfidence, window = 1) {
    const pieceCount = gazetteer.confidence.length;
    const muted = new Set();
    for (let anchor = 0; anchor < pieceCount; anchor++) {
        const isPostcodePiece = (anchorConfidence[anchor] ?? 0) > 0;
        if (!isPostcodePiece)
            continue;
        for (let offset = -window; offset <= window; offset++) {
            // The postcode piece itself is never silenced by its own hit, only its neighbors.
            if (offset === 0)
                continue;
            const neighbor = anchor + offset;
            if (neighbor >= 0 && neighbor < pieceCount)
                muted.add(neighbor);
        }
    }
    const width = gazetteer.features[0]?.length ?? 0;
    const features = gazetteer.features.map((row, index) => {
        if (muted.has(index))
            return new Array(width).fill(0);
        return row;
    });
    const confidence = gazetteer.confidence.map((value, index) => (muted.has(index) ? 0 : value));
    return { features, confidence };
}
|
|
157
|
+
/**
 * Build the per-piece gazetteer clue for `text`: paint the surface with `gazetteerCharPaint`, then
 * project the painted characters onto the SP `pieces` using the SAME char→piece rule the labels
 * use — a piece inherits the bitmask of the first non-whitespace character it covers.
 *
 * @param text - The raw address surface.
 * @param pieces - Tokenized pieces; each piece's `start`/`end` are read as indices into `text`.
 * @param lexicon - The parsed lexicon.
 * @returns `features` (`pieces × featureDim`, 0/1 per slot) and `confidence` (`(pieces,)`, 1.0
 *   wherever any bit fires and 0 elsewhere).
 */
export function buildGazetteerFeatures(text, pieces, lexicon) {
    const painted = gazetteerCharPaint(text, lexicon);
    const whitespace = /\s/;
    const features = [];
    const confidence = [];
    for (const piece of pieces) {
        // Offsets past the end of the text carry no character, so the scan stops there.
        const stop = Math.min(piece.end, text.length);
        let mask = 0;
        let cursor = piece.start;
        while (cursor < stop) {
            if (!whitespace.test(text[cursor])) {
                mask = painted[cursor];
                break;
            }
            cursor++;
        }
        if (mask) {
            features.push(bitsToRow(mask, lexicon));
            confidence.push(1.0);
        }
        else {
            features.push(new Array(lexicon.featureDim).fill(0));
            confidence.push(0);
        }
    }
    return { features, confidence };
}
|
|
180
|
+
//# sourceMappingURL=gazetteer-inference.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"gazetteer-inference.js","sourceRoot":"","sources":["../gazetteer-inference.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAIH;;;;GAIG;AACH,MAAM,CAAC,MAAM,qBAAqB,GAAG,CAAC,CAAA;AActC,qGAAqG;AACrG,MAAM,UAAU,qBAAqB,CAAC,GAOrC;IACA,yFAAyF;IACzF,8FAA8F;IAC9F,wCAAwC;IACxC,IAAI,OAAO,GAAG,EAAE,WAAW,KAAK,QAAQ,IAAI,GAAG,CAAC,WAAW,IAAI,CAAC,EAAE,CAAC;QAClE,MAAM,IAAI,KAAK,CAAC,iEAAiE,GAAG,EAAE,WAAW,EAAE,CAAC,CAAA;IACrG,CAAC;IACD,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzD,MAAM,IAAI,KAAK,CAAC,oDAAoD,CAAC,CAAA;IACtE,CAAC;IACD,IAAI,OAAO,GAAG,CAAC,SAAS,KAAK,QAAQ,IAAI,GAAG,CAAC,SAAS,GAAG,CAAC,EAAE,CAAC;QAC5D,MAAM,IAAI,KAAK,CAAC,kDAAkD,GAAG,CAAC,SAAS,EAAE,CAAC,CAAA;IACnF,CAAC;IACD,KAAK,MAAM,KAAK,IAAI,CAAC,MAAM,EAAE,SAAS,EAAE,cAAc,CAAU,EAAE,CAAC;QAClE,IAAI,OAAO,GAAG,CAAC,KAAK,CAAC,KAAK,QAAQ,IAAI,GAAG,CAAC,KAAK,CAAC,KAAK,IAAI,EAAE,CAAC;YAC3D,MAAM,IAAI,KAAK,CAAC,sBAAsB,KAAK,oBAAoB,CAAC,CAAA;QACjE,CAAC;IACF,CAAC;IACD,OAAO;QACN,UAAU,EAAE,GAAG,CAAC,WAAW;QAC3B,KAAK,EAAE,GAAG,CAAC,KAAK;QAChB,IAAI,EAAE,GAAG,CAAC,IAAI;QACd,QAAQ,EAAE,GAAG,CAAC,SAAS;QACvB,OAAO,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QAC7C,WAAW,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;KACtD,CAAA;AACF,CAAC;AAED,6FAA6F;AAC7F,SAAS,SAAS,CAAC,IAAY;IAC9B,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAA;IACrB,MAAM,KAAK,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACpD,OAAO,KAAK,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAE,CAAC;QAAE,KAAK,EAAE,CAAA;IACnD,OAAO,GAAG,GAAG,KAAK,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,CAAE,CAAC;QAAE,GAAG,EAAE,CAAA;IACnD,OAAO,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAA;AAC9B,CAAC;AAED,SAAS,SAAS,CAAC,IAAY,EAAE,OAAyB;IACzD,OAAO,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;AACzE,CAAC;AAQD,gGAAgG;AAChG,MAAM,UAAU,kBAAkB,CAAC,
IAAY,EAAE,OAAyB;IACzE,MAAM,QAAQ,GAAG,IAAI,KAAK,CAAS,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACvD,MAAM,MAAM,GAAG,MAAM,CAAA;IACrB,MAAM,KAAK,GAAe,EAAE,CAAA;IAC5B,IAAI,CAAyB,CAAA;IAC7B,OAAO,CAAC,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACzC,MAAM,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,CAAA;QACpB,MAAM,QAAQ,GAAG,SAAS,CAAC,OAAO,CAAC,CAAA;QACnC,IAAI,CAAC,QAAQ,EAAE,CAAC;YACf,KAAK,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,CAAA;YACtD,SAAQ;QACT,CAAC;QACD,IAAI,IAAI,GAAG,CAAC,CAAA;QACZ,MAAM,KAAK,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACpD,OAAO,IAAI,GAAG,OAAO,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAE,CAAC;YAAE,IAAI,EAAE,CAAA;QAC9D,KAAK,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,GAAG,IAAI,EAAE,GAAG,EAAE,CAAC,CAAC,KAAK,GAAG,IAAI,GAAG,QAAQ,CAAC,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,CAAA;IAC7F,CAAC;IAED,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;QACzB,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,CAAC;YACrB,CAAC,EAAE,CAAA;YACH,SAAQ;QACT,CAAC;QACD,IAAI,QAAQ,GAAG,CAAC,CAAA;QAChB,IAAI,WAAW,GAAG,CAAC,CAAA;QACnB,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,QAAQ,EAAE,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;QACzD,KAAK,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChC,MAAM,KAAK,GAAa,EAAE,CAAA;YAC1B,IAAI,EAAE,GAAG,IAAI,CAAA;YACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;gBAChC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,CAAC;oBACrB,EAAE,GAAG,KAAK,CAAA;oBACV,MAAK;gBACN,CAAC;gBACD,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,CAAA;YAC3B,CAAC;YACD,IAAI,CAAC,EAAE;gBAAE,SAAQ;YACjB,MAAM,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,WAAW,EAAE,CAAA;YACzC,IAAI,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;YACxC,uFAAuF;YACvF,IAAI,CAAC,KAAK,CAAC;gBAAE,IAAI,IAAI,OAAO,CAAC,WAAW,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,CAAA;YAC5D,IAAI,IAAI,EAAE,CAAC;gBACV,QAAQ,GAAG,CAAC,C
AAA;gBACZ,WAAW,GAAG,IAAI,CAAA;gBAClB,MAAK;YACN,CAAC;QACF,CAAC;QACD,IAAI,QAAQ,EAAE,CAAC;YACd,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAE,CAAC,KAAK,CAAA;YAC7B,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,GAAG,QAAQ,GAAG,CAAC,CAAE,CAAC,GAAG,CAAA;YACxC,KAAK,IAAI,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE;gBAAE,QAAQ,CAAC,CAAC,CAAC,GAAG,WAAW,CAAA;YAClF,CAAC,IAAI,QAAQ,CAAA;QACd,CAAC;aAAM,CAAC;YACP,CAAC,EAAE,CAAA;QACJ,CAAC;IACF,CAAC;IACD,OAAO,QAAQ,CAAA;AAChB,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,UAAU,6BAA6B,CAC5C,SAAyD,EACzD,gBAAuC,EACvC,MAAM,GAAG,CAAC;IAEV,MAAM,CAAC,GAAG,SAAS,CAAC,UAAU,CAAC,MAAM,CAAA;IACrC,MAAM,QAAQ,GAAG,IAAI,KAAK,CAAU,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IAClD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5B,IAAI,CAAC,gBAAgB,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC;YACpC,KAAK,IAAI,CAAC,GAAG,CAAC,MAAM,EAAE,CAAC,IAAI,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACxC,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;gBACf,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC;oBAAE,QAAQ,CAAC,CAAC,CAAC,GAAG,IAAI,CAAA;YACnD,CAAC;QACF,CAAC;IACF,CAAC;IACD,MAAM,GAAG,GAAG,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,MAAM,IAAI,CAAC,CAAA;IAC9C,OAAO;QACN,QAAQ,EAAE,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,KAAK,CAAS,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAClG,UAAU,EAAE,SAAS,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;KACrE,CAAA;AACF,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,sBAAsB,CACrC,IAAY,EACZ,MAAqC,EACrC,OAAyB;IAEzB,MAAM,QAAQ,GAAG,kBAAkB,CAAC,IAAI,EAAE,OAAO,CAAC,CAAA;IAClD,MAAM,IAAI,GAAG,GAAG,EAAE,CAAC,IAAI,KAAK,CAAS,OAAO,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IAChE,MAAM,QAAQ,GAAe,EAAE,CAAA;IAC/B,MAAM,UAAU,GAAa,EAAE,CAAA;IAC/B,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACxB,IAAI,IAAI,GAAG,CAAC,CAAA;QACZ,KAAK,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,GAAG,CAAC,CAAC,G
AAG,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,IAAI,CAAC,GAAG,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,EAAE,CAAC;gBAC7C,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAE,CAAA;gBACnB,MAAK;YACN,CAAC;QACF,CAAC;QACD,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAA;QACvD,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IAChC,CAAC;IACD,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAA;AAChC,CAAC"}
|
package/out/index.d.ts
CHANGED
package/out/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,uBAAuB,CAAA;AACrC,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,sBAAsB,CAAA;AACpC,cAAc,+BAA+B,CAAA;AAC7C,cAAc,0BAA0B,CAAA;AACxC,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAA;AAC/E,YAAY,EAAE,eAAe,EAAE,kBAAkB,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAA;AAC5G,cAAc,gBAAgB,CAAA;AAC9B,OAAO,EACN,eAAe,EACf,iBAAiB,EACjB,sBAAsB,EACtB,cAAc,EACd,OAAO,EACP,OAAO,GACP,MAAM,cAAc,CAAA;AACrB,YAAY,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,cAAc,CAAA;AAC/D,cAAc,cAAc,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,uBAAuB,CAAA;AACrC,cAAc,iBAAiB,CAAA;AAC/B,cAAc,0BAA0B,CAAA;AACxC,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,sBAAsB,CAAA;AACpC,cAAc,+BAA+B,CAAA;AAC7C,cAAc,0BAA0B,CAAA;AACxC,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAA;AAC/E,YAAY,EAAE,eAAe,EAAE,kBAAkB,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAA;AAC5G,cAAc,gBAAgB,CAAA;AAC9B,OAAO,EACN,eAAe,EACf,iBAAiB,EACjB,sBAAsB,EACtB,cAAc,EACd,OAAO,EACP,OAAO,GACP,MAAM,cAAc,CAAA;AACrB,YAAY,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,cAAc,CAAA;AAC/D,cAAc,cAAc,CAAA"}
|
package/out/index.js
CHANGED
package/out/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,uBAAuB,CAAA;AACrC,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,sBAAsB,CAAA;AACpC,cAAc,+BAA+B,CAAA;AAC7C,cAAc,0BAA0B,CAAA;AACxC,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAA;AAE/E,cAAc,gBAAgB,CAAA;AAC9B,OAAO,EACN,eAAe,EACf,iBAAiB,EACjB,sBAAsB,EACtB,cAAc,EACd,OAAO,EACP,OAAO,GACP,MAAM,cAAc,CAAA;AAErB,cAAc,cAAc,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,uBAAuB,CAAA;AACrC,cAAc,iBAAiB,CAAA;AAC/B,cAAc,0BAA0B,CAAA;AACxC,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,sBAAsB,CAAA;AACpC,cAAc,+BAA+B,CAAA;AAC7C,cAAc,0BAA0B,CAAA;AACxC,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAA;AAE/E,cAAc,gBAAgB,CAAA;AAC9B,OAAO,EACN,eAAe,EACf,iBAAiB,EACjB,sBAAsB,EACtB,cAAc,EACd,OAAO,EACP,OAAO,GACP,MAAM,cAAc,CAAA;AAErB,cAAc,cAAc,CAAA"}
|
package/out/onnx-runner.d.ts
CHANGED
|
@@ -30,6 +30,12 @@ export interface InferResult {
|
|
|
30
30
|
logits: number[][];
|
|
31
31
|
/** Number of label classes (the inner-dim of the logits tensor). */
|
|
32
32
|
numLabels: number;
|
|
33
|
+
/**
|
|
34
|
+
* Pooled locale-head posterior (`locale_logits` output, LOCALE_COUNTRIES order), when the model
|
|
35
|
+
* exports it (v1.1.0+, #511 Tier A). Absent on older bundles — consumers must treat undefined
|
|
36
|
+
* as "no address-system detection available".
|
|
37
|
+
*/
|
|
38
|
+
localeLogits?: number[];
|
|
33
39
|
}
|
|
34
40
|
export declare class OnnxRunner {
|
|
35
41
|
private readonly modelPath;
|
|
@@ -58,6 +64,9 @@ export declare class OnnxRunner {
|
|
|
58
64
|
infer(tokenIds: number[], anchor?: {
|
|
59
65
|
features: ReadonlyArray<ReadonlyArray<number>>;
|
|
60
66
|
confidence: ReadonlyArray<number>;
|
|
67
|
+
}, gazetteer?: {
|
|
68
|
+
features: ReadonlyArray<ReadonlyArray<number>>;
|
|
69
|
+
confidence: ReadonlyArray<number>;
|
|
61
70
|
}): Promise<InferResult>;
|
|
62
71
|
}
|
|
63
72
|
//# sourceMappingURL=onnx-runner.d.ts.map
|
package/out/onnx-runner.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"onnx-runner.d.ts","sourceRoot":"","sources":["../onnx-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;
|
|
1
|
+
{"version":3,"file":"onnx-runner.d.ts","sourceRoot":"","sources":["../onnx-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAQH,MAAM,WAAW,cAAc;IAC9B,wEAAwE;IACxE,MAAM,CAAC,EAAE,OAAO,CAAA;IAChB;;;;;OAKG;IACH,WAAW,CAAC,EAAE,MAAM,CAAA;CACpB;AAED,8FAA8F;AAC9F,eAAO,MAAM,qBAAqB,MAAM,CAAA;AAExC,MAAM,WAAW,WAAW;IAC3B,2EAA2E;IAC3E,MAAM,EAAE,MAAM,EAAE,EAAE,CAAA;IAClB,oEAAoE;IACpE,SAAS,EAAE,MAAM,CAAA;IACjB;;;;OAIG;IACH,YAAY,CAAC,EAAE,MAAM,EAAE,CAAA;CACvB;AAED,qBAAa,UAAU;IAMrB,OAAO,CAAC,QAAQ,CAAC,SAAS;IAC1B,OAAO,CAAC,QAAQ,CAAC,UAAU;IAN5B,OAAO,CAAC,OAAO,CAAoC;IACnD,OAAO,CAAC,WAAW,CAA6C;IAChE,SAAgB,WAAW,EAAE,MAAM,CAAA;IAEnC,OAAO;IAQP,oEAAoE;WACvD,MAAM,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,GAAE,cAAmB,GAAG,OAAO,CAAC,UAAU,CAAC;IAMtF,6CAA6C;WAChC,SAAS,CAAC,UAAU,EAAE,UAAU,EAAE,IAAI,GAAE,cAAmB,GAAG,OAAO,CAAC,UAAU,CAAC;YAMhF,aAAa;IAgB3B;;;;;;;;;;;OAWG;IACG,KAAK,CACV,QAAQ,EAAE,MAAM,EAAE,EAClB,MAAM,CAAC,EAAE;QAAE,QAAQ,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC;QAAC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,CAAA;KAAE,EAC9F,SAAS,CAAC,EAAE;QAAE,QAAQ,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC;QAAC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,CAAA;KAAE,GAC/F,OAAO,CAAC,WAAW,CAAC;CAoFvB"}
|
package/out/onnx-runner.js
CHANGED
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
import { promises as fs } from "node:fs";
|
|
16
16
|
import ort from "onnxruntime-node";
|
|
17
17
|
import { ANCHOR_FEATURE_DIM } from "./anchor-inference.js";
|
|
18
|
+
import { GAZETTEER_FEATURE_DIM } from "./gazetteer-inference.js";
|
|
18
19
|
/** Default sequence length for v0.1.0 / v0.2.0 (BertConfig max_position_embeddings = 128). */
|
|
19
20
|
export const DEFAULT_FIXED_SEQ_LEN = 128;
|
|
20
21
|
export class OnnxRunner {
|
|
@@ -70,7 +71,7 @@ export class OnnxRunner {
|
|
|
70
71
|
* `(seqLen × dim)` + confidence `(seqLen,)` are fed, zero-padded to `fixedSeqLen`. Omit for
|
|
71
72
|
* plain models, whose ONNX has no anchor inputs.
|
|
72
73
|
*/
|
|
73
|
-
async infer(tokenIds, anchor) {
|
|
74
|
+
async infer(tokenIds, anchor, gazetteer) {
|
|
74
75
|
const session = await this.ensureSession();
|
|
75
76
|
const seqLen = Math.min(tokenIds.length, this.fixedSeqLen);
|
|
76
77
|
const padded = new BigInt64Array(this.fixedSeqLen);
|
|
@@ -108,6 +109,30 @@ export class OnnxRunner {
|
|
|
108
109
|
]);
|
|
109
110
|
feeds.anchor_confidence = new ort.Tensor("float32", new Float32Array(this.fixedSeqLen), [1, this.fixedSeqLen]);
|
|
110
111
|
}
|
|
112
|
+
// Gazetteer-anchor channel (#464): same feed contract as the postcode anchor. Feature width is
|
|
113
|
+
// read from the supplied rows (the lexicon's slot count); a gazetteer-trained model with no clue
|
|
114
|
+
// data supplied gets the confidence=0 identity (the model's gazetteer-off behavior).
|
|
115
|
+
if (gazetteer && session.inputNames.includes("gazetteer_features")) {
|
|
116
|
+
const dim = gazetteer.features[0]?.length ?? 0;
|
|
117
|
+
const gf = new Float32Array(this.fixedSeqLen * dim);
|
|
118
|
+
const gc = new Float32Array(this.fixedSeqLen);
|
|
119
|
+
for (let i = 0; i < seqLen; i++) {
|
|
120
|
+
gc[i] = gazetteer.confidence[i] ?? 0;
|
|
121
|
+
const row = gazetteer.features[i];
|
|
122
|
+
if (row)
|
|
123
|
+
for (let d = 0; d < dim; d++)
|
|
124
|
+
gf[i * dim + d] = row[d] ?? 0;
|
|
125
|
+
}
|
|
126
|
+
feeds.gazetteer_features = new ort.Tensor("float32", gf, [1, this.fixedSeqLen, dim]);
|
|
127
|
+
feeds.gazetteer_confidence = new ort.Tensor("float32", gc, [1, this.fixedSeqLen]);
|
|
128
|
+
}
|
|
129
|
+
else if (session.inputNames.includes("gazetteer_features")) {
|
|
130
|
+
feeds.gazetteer_features = new ort.Tensor("float32", new Float32Array(this.fixedSeqLen * GAZETTEER_FEATURE_DIM), [1, this.fixedSeqLen, GAZETTEER_FEATURE_DIM]);
|
|
131
|
+
feeds.gazetteer_confidence = new ort.Tensor("float32", new Float32Array(this.fixedSeqLen), [
|
|
132
|
+
1,
|
|
133
|
+
this.fixedSeqLen,
|
|
134
|
+
]);
|
|
135
|
+
}
|
|
111
136
|
const output = await session.run(feeds);
|
|
112
137
|
const logitsTensor = output.logits;
|
|
113
138
|
if (!logitsTensor)
|
|
@@ -122,7 +147,10 @@ export class OnnxRunner {
|
|
|
122
147
|
row[l] = data[base + l];
|
|
123
148
|
logits.push(row);
|
|
124
149
|
}
|
|
125
|
-
|
|
150
|
+
// Locale head (#511 Tier A): present on v1.1.0+ exports, absent (and optional) before.
|
|
151
|
+
const localeTensor = output.locale_logits;
|
|
152
|
+
const localeLogits = localeTensor ? Array.from(localeTensor.data) : undefined;
|
|
153
|
+
return { logits, numLabels, ...(localeLogits ? { localeLogits } : {}) };
|
|
126
154
|
}
|
|
127
155
|
}
|
|
128
156
|
//# sourceMappingURL=onnx-runner.js.map
|
package/out/onnx-runner.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"onnx-runner.js","sourceRoot":"","sources":["../onnx-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,SAAS,CAAA;AACxC,OAAO,GAAG,MAAM,kBAAkB,CAAA;AAElC,OAAO,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAA;
|
|
1
|
+
{"version":3,"file":"onnx-runner.js","sourceRoot":"","sources":["../onnx-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,SAAS,CAAA;AACxC,OAAO,GAAG,MAAM,kBAAkB,CAAA;AAElC,OAAO,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAA;AAC1D,OAAO,EAAE,qBAAqB,EAAE,MAAM,0BAA0B,CAAA;AAchE,8FAA8F;AAC9F,MAAM,CAAC,MAAM,qBAAqB,GAAG,GAAG,CAAA;AAexC,MAAM,OAAO,UAAU;IAMJ;IACA;IANV,OAAO,GAAgC,IAAI,CAAA;IAC3C,WAAW,GAAyC,IAAI,CAAA;IAChD,WAAW,CAAQ;IAEnC,YACkB,SAAiB,EACjB,UAA6B,EAC9C,IAAoB;QAFH,cAAS,GAAT,SAAS,CAAQ;QACjB,eAAU,GAAV,UAAU,CAAmB;QAG9C,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,qBAAqB,CAAA;IAC7D,CAAC;IAED,oEAAoE;IACpE,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,SAAiB,EAAE,OAAuB,EAAE;QAC/D,MAAM,MAAM,GAAG,IAAI,UAAU,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,CAAC,CAAA;QACpD,IAAI,IAAI,CAAC,MAAM;YAAE,MAAM,MAAM,CAAC,aAAa,EAAE,CAAA;QAC7C,OAAO,MAAM,CAAA;IACd,CAAC;IAED,6CAA6C;IAC7C,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,UAAsB,EAAE,OAAuB,EAAE;QACvE,MAAM,MAAM,GAAG,IAAI,UAAU,CAAC,SAAS,EAAE,UAAU,EAAE,IAAI,CAAC,CAAA;QAC1D,IAAI,IAAI,CAAC,MAAM;YAAE,MAAM,MAAM,CAAC,aAAa,EAAE,CAAA;QAC7C,OAAO,MAAM,CAAA;IACd,CAAC;IAEO,KAAK,CAAC,aAAa;QAC1B,IAAI,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC,OAAO,CAAA;QACrC,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC;YACvB,IAAI,CAAC,WAAW,GAAG,CAAC,KAAK,IAAI,EAAE;gBAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,UAAU,IAAI,IAAI,UAAU,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAA;gBAClF,MAAM,OAAO,GAAG,MAAM,GAAG,CAAC,gBAAgB,CAAC,MAAM,CAAC,KAAK,EAAE;oBACxD,kBAAkB,EAAE,CAAC,KAAK,CAAC;oBAC3B,sBAAsB,EAAE,KAAK;iBAC7B,CAAC,CAAA;gBACF,IAAI,CAAC,OAAO,GAAG,OAAO,CAAA;gBACtB,OAAO,OAAO,CAAA;YACf,CAAC,CAAC,EAAE,CAAA;QACL,CAAC;QACD,OAAO,IAAI,CAAC,WAAW,CAAA;IACxB,CAAC;IAED;;;;;;;;;;;OAWG;IACH,KAAK,CAAC,KAAK,CACV,QAAkB,EAClB,MAA8F,EAC9F,SAAiG;QAEjG,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,aAAa,EAAE,CAAA;QAC1C,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,EAAE,IAAI,CAAC,WAAW,CAAC,CAAA;QAC1D,MAAM,MAAM,GAAG,IAAI,aAAa,CAAC,IAAI,CAAC,WAAW,CAAC,CAAA;QAClD,MAAM,IAAI,GAAG,IAAI,aAAa,CAAC,IAAI,CAAC,WAAW,CAAC,CAAA;QAChD,KAAK,IAAI,CAAC,GAAG
,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjC,MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAE,CAAC,CAAA;YAChC,IAAI,CAAC,CAAC,CAAC,GAAG,EAAE,CAAA;QACb,CAAC;QAED,MAAM,KAAK,GAA+B;YACzC,SAAS,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;YACjE,cAAc,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;SACpE,CAAA;QAED,IAAI,MAAM,EAAE,CAAC;YACZ,MAAM,GAAG,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,MAAM,IAAI,CAAC,CAAA;YAC3C,MAAM,EAAE,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,GAAG,GAAG,CAAC,CAAA;YACnD,MAAM,EAAE,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,CAAC,CAAA;YAC7C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACjC,EAAE,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;gBACjC,MAAM,GAAG,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAA;gBAC9B,IAAI,GAAG;oBAAE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE;wBAAE,EAAE,CAAC,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;YACrE,CAAC;YACD,KAAK,CAAC,eAAe,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,CAAA;YACjF,KAAK,CAAC,iBAAiB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAA;QAC/E,CAAC;aAAM,IAAI,OAAO,CAAC,UAAU,CAAC,QAAQ,CAAC,iBAAiB,CAAC,EAAE,CAAC;YAC3D,6FAA6F;YAC7F,0FAA0F;YAC1F,0EAA0E;YAC1E,KAAK,CAAC,eAAe,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,GAAG,kBAAkB,CAAC,EAAE;gBAC1G,CAAC;gBACD,IAAI,CAAC,WAAW;gBAChB,kBAAkB;aAClB,CAAC,CAAA;YACF,KAAK,CAAC,iBAAiB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAA;QAC/G,CAAC;QAED,+FAA+F;QAC/F,iGAAiG;QACjG,qFAAqF;QACrF,IAAI,SAAS,IAAI,OAAO,CAAC,UAAU,CAAC,QAAQ,CAAC,oBAAoB,CAAC,EAAE,CAAC;YACpE,MAAM,GAAG,GAAG,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,MAAM,IAAI,CAAC,CAAA;YAC9C,MAAM,EAAE,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,GAAG,GAAG,CAAC,CAAA;YACnD,MAAM,EAAE,GAAG,IAA
I,YAAY,CAAC,IAAI,CAAC,WAAW,CAAC,CAAA;YAC7C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACjC,EAAE,CAAC,CAAC,CAAC,GAAG,SAAS,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;gBACpC,MAAM,GAAG,GAAG,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAA;gBACjC,IAAI,GAAG;oBAAE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE;wBAAE,EAAE,CAAC,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;YACrE,CAAC;YACD,KAAK,CAAC,kBAAkB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,CAAA;YACpF,KAAK,CAAC,oBAAoB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAA;QAClF,CAAC;aAAM,IAAI,OAAO,CAAC,UAAU,CAAC,QAAQ,CAAC,oBAAoB,CAAC,EAAE,CAAC;YAC9D,KAAK,CAAC,kBAAkB,GAAG,IAAI,GAAG,CAAC,MAAM,CACxC,SAAS,EACT,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,GAAG,qBAAqB,CAAC,EAC1D,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,EAAE,qBAAqB,CAAC,CAC5C,CAAA;YACD,KAAK,CAAC,oBAAoB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,CAAC,EAAE;gBAC1F,CAAC;gBACD,IAAI,CAAC,WAAW;aAChB,CAAC,CAAA;QACH,CAAC;QAED,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAA;QACvC,MAAM,YAAY,GAAG,MAAM,CAAC,MAAM,CAAA;QAClC,IAAI,CAAC,YAAY;YAAE,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAA;QACjF,MAAM,IAAI,GAAG,YAAY,CAAC,IAAoB,CAAA;QAC9C,MAAM,CAAC,EAAE,AAAD,EAAG,SAAS,CAAC,GAAG,YAAY,CAAC,IAAyC,CAAA;QAE9E,MAAM,MAAM,GAAe,EAAE,CAAA;QAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjC,MAAM,GAAG,GAAa,IAAI,KAAK,CAAC,SAAS,CAAC,CAAA;YAC1C,MAAM,IAAI,GAAG,CAAC,GAAG,SAAS,CAAA;YAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE;gBAAE,GAAG,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,GAAG,CAAC,CAAE,CAAA;YAC5D,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QACjB,CAAC;QAED,uFAAuF;QACvF,MAAM,YAAY,GAAG,MAAM,CAAC,aAAa,CAAA;QACzC,MAAM,YAAY,GAAG,YAAY,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,IAAoB,CAAC,CAAC,CAAC,CAAC,SAAS,CAAA;QAE7F,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CA
AC,EAAE,CAAA;IACxE,CAAC;CACD"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mailwoman/neural",
|
|
3
|
-
"version": "4.1.0",
|
|
3
|
+
"version": "4.3.0",
|
|
4
4
|
"description": "Mailwoman neural classifier runtime: SentencePiece tokenizer + ONNX inference + decoder wiring.",
|
|
5
5
|
"license": "AGPL-3.0-only",
|
|
6
6
|
"repository": {
|
|
@@ -20,8 +20,8 @@
|
|
|
20
20
|
"./browser": "./out/browser.js"
|
|
21
21
|
},
|
|
22
22
|
"dependencies": {
|
|
23
|
-
"@mailwoman/codex": "4.1.0",
|
|
24
|
-
"@mailwoman/core": "4.1.0",
|
|
23
|
+
"@mailwoman/codex": "4.3.0",
|
|
24
|
+
"@mailwoman/core": "4.3.0",
|
|
25
25
|
"@sctg/sentencepiece-js": "^1.3.3",
|
|
26
26
|
"onnxruntime-node": "^1.26.0"
|
|
27
27
|
},
|