@mailwoman/registry 4.8.1 → 4.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/address-key.d.ts +31 -0
- package/out/address-key.d.ts.map +1 -0
- package/out/address-key.js +38 -0
- package/out/address-key.js.map +1 -0
- package/out/geojson.d.ts.map +1 -1
- package/out/geojson.js +2 -0
- package/out/geojson.js.map +1 -1
- package/out/index.d.ts +3 -0
- package/out/index.d.ts.map +1 -1
- package/out/index.js +3 -0
- package/out/index.js.map +1 -1
- package/out/ingest.d.ts +53 -4
- package/out/ingest.d.ts.map +1 -1
- package/out/ingest.js +111 -2
- package/out/ingest.js.map +1 -1
- package/out/learned-scorer.d.ts +57 -0
- package/out/learned-scorer.d.ts.map +1 -0
- package/out/learned-scorer.js +78 -0
- package/out/learned-scorer.js.map +1 -0
- package/out/models/dedup-gbt-en-us.d.ts +36 -0
- package/out/models/dedup-gbt-en-us.d.ts.map +1 -0
- package/out/models/dedup-gbt-en-us.js +36 -0
- package/out/models/dedup-gbt-en-us.js.map +1 -0
- package/out/resolve.d.ts +114 -4
- package/out/resolve.d.ts.map +1 -1
- package/out/resolve.js +165 -21
- package/out/resolve.js.map +1 -1
- package/out/types.d.ts +7 -0
- package/out/types.d.ts.map +1 -1
- package/out/types.js.map +1 -1
- package/package.json +6 -4
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* The address-id consumer for the matcher (#259). Derives a stable {@link PostalAddressID} from a
|
|
7
|
+
* resolved {@link SourceRecord} and exposes it as a blocking key — the deterministic,
|
|
8
|
+
* exact-canonical-address complement to the fuzzy Fellegi-Sunter / GBT scoring. Two uses:
|
|
9
|
+
*
|
|
10
|
+
* - **As a pre-dedup / join key:** `GROUP BY postalAddressId(record)` collapses records that resolve
|
|
11
|
+
* to the same place AND share a canonical address with NO scoring at all — the cheap, certain
|
|
12
|
+
* slice of dedup before the matcher does the fuzzy rest.
|
|
13
|
+
* - **As a blocking key:** {@link addressIdBlockingKey} adds the address-id to the blocking union, so
|
|
14
|
+
* records sharing one are guaranteed to be compared.
|
|
15
|
+
*/
|
|
16
|
+
import { type PostalAddressID } from "@mailwoman/address-id";
|
|
17
|
+
import { type BlockingKey } from "@mailwoman/match";
|
|
18
|
+
import type { SourceRecord } from "./types.js";
|
|
19
|
+
/**
|
|
20
|
+
* The stable address primary key for a record, or null when it isn't geocoded (no coordinate → no
|
|
21
|
+
* locality cell) or carries no raw address to hash. Uses the resolved coordinate + the raw address;
|
|
22
|
+
* the state prefix is plucked from the address when present.
|
|
23
|
+
*/
|
|
24
|
+
export declare function postalAddressId(record: SourceRecord): PostalAddressID | null;
|
|
25
|
+
/**
|
|
26
|
+
* A blocking key on the {@link postalAddressId} — records that resolve to the same place with the
|
|
27
|
+
* same canonical address block together. Add it to {@link defaultBlockingKeys}'s union when an exact
|
|
28
|
+
* address join should never be missed.
|
|
29
|
+
*/
|
|
30
|
+
export declare function addressIdBlockingKey(): BlockingKey<SourceRecord>;
|
|
31
|
+
//# sourceMappingURL=address-key.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"address-key.d.ts","sourceRoot":"","sources":["../address-key.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,EAAyB,KAAK,eAAe,EAAE,MAAM,uBAAuB,CAAA;AACnF,OAAO,EAAE,KAAK,WAAW,EAAY,MAAM,kBAAkB,CAAA;AAC7D,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,MAAM,EAAE,YAAY,GAAG,eAAe,GAAG,IAAI,CAK5E;AAED;;;;GAIG;AACH,wBAAgB,oBAAoB,IAAI,WAAW,CAAC,YAAY,CAAC,CAEhE"}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* The address-id consumer for the matcher (#259). Derives a stable {@link PostalAddressID} from a
|
|
7
|
+
* resolved {@link SourceRecord} and exposes it as a blocking key — the deterministic,
|
|
8
|
+
* exact-canonical-address complement to the fuzzy Fellegi-Sunter / GBT scoring. Two uses:
|
|
9
|
+
*
|
|
10
|
+
* - **As a pre-dedup / join key:** `GROUP BY postalAddressId(record)` collapses records that resolve
|
|
11
|
+
* to the same place AND share a canonical address with NO scoring at all — the cheap, certain
|
|
12
|
+
* slice of dedup before the matcher does the fuzzy rest.
|
|
13
|
+
* - **As a blocking key:** {@link addressIdBlockingKey} adds the address-id to the blocking union, so
|
|
14
|
+
* records sharing one are guaranteed to be compared.
|
|
15
|
+
*/
|
|
16
|
+
import { createPostalAddressID } from "@mailwoman/address-id";
|
|
17
|
+
import { exactKey } from "@mailwoman/match";
|
|
18
|
+
/**
 * Derive the stable address primary key for a record.
 *
 * Reads the resolved coordinate (`record.address.geocode.coordinate`) and the raw address string
 * (`record.address.raw`) and hands both to `createPostalAddressID`.
 *
 * @param record The source record to key.
 * @returns The postal address id, or `null` when the record has no coordinate (not geocoded) or no
 *   raw address to hash.
 */
export function postalAddressId(record) {
    const { geocode, raw } = record.address ?? {};
    const coordinate = geocode?.coordinate;
    // Both inputs are required; anything falsy on either side means "no key".
    if (coordinate && raw) {
        return createPostalAddressID({ coordinate, address: raw });
    }
    return null;
}
|
|
30
|
+
/**
 * Build a blocking key over {@link postalAddressId}, so records that resolve to the same place with
 * the same canonical address land in the same block. Add it to the {@link defaultBlockingKeys} union
 * when an exact address join must never be missed.
 *
 * @returns The blocking key produced by `exactKey` for the address-id extractor.
 */
export function addressIdBlockingKey() {
    return exactKey((candidate) => {
        return postalAddressId(candidate);
    });
}
|
|
38
|
+
//# sourceMappingURL=address-key.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"address-key.js","sourceRoot":"","sources":["../address-key.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,EAAE,qBAAqB,EAAwB,MAAM,uBAAuB,CAAA;AACnF,OAAO,EAAoB,QAAQ,EAAE,MAAM,kBAAkB,CAAA;AAG7D;;;;GAIG;AACH,MAAM,UAAU,eAAe,CAAC,MAAoB;IACnD,MAAM,UAAU,GAAG,MAAM,CAAC,OAAO,EAAE,OAAO,EAAE,UAAU,CAAA;IACtD,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,EAAE,GAAG,CAAA;IACnC,IAAI,CAAC,UAAU,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IACxC,OAAO,qBAAqB,CAAC,EAAE,UAAU,EAAE,OAAO,EAAE,CAAC,CAAA;AACtD,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,oBAAoB;IACnC,OAAO,QAAQ,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,CAAA;AACrD,CAAC"}
|
package/out/geojson.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"geojson.d.ts","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,KAAK,EAAkB,wBAAwB,EAAE,cAAc,EAAgB,MAAM,YAAY,CAAA;
|
|
1
|
+
{"version":3,"file":"geojson.d.ts","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,KAAK,EAAkB,wBAAwB,EAAE,cAAc,EAAgB,MAAM,YAAY,CAAA;AAqCxG;;;GAGG;AACH,wBAAgB,SAAS,CAAC,QAAQ,EAAE,SAAS,cAAc,EAAE,GAAG,wBAAwB,CAKvF"}
|
package/out/geojson.js
CHANGED
|
@@ -34,6 +34,8 @@ function toFeature(entity) {
|
|
|
34
34
|
recordCount: entity.records.length,
|
|
35
35
|
cohesion: entity.cohesion,
|
|
36
36
|
sourceIds: entity.records.map((r) => r.id),
|
|
37
|
+
// Distinct provenance labels the entity's records span — an entity with ≥2 is a cross-dataset link.
|
|
38
|
+
sources: [...new Set(entity.records.map((r) => r.source).filter((s) => !!s))].sort(),
|
|
37
39
|
name: displayName(rep),
|
|
38
40
|
organization: rep.organization?.canonical ?? null,
|
|
39
41
|
address: rep.address?.formatted ?? null,
|
package/out/geojson.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"geojson.js","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAIH,0EAA0E;AAC1E,SAAS,WAAW,CAAC,MAAoB;IACxC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAA;IACxB,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAA;IACtB,MAAM,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,cAAc,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC;SAClG,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,GAAG,CAAC;SACT,IAAI,EAAE,CAAA;IACR,OAAO,MAAM,IAAI,IAAI,CAAA;AACtB,CAAC;AAED,8CAA8C;AAC9C,SAAS,SAAS,CAAC,MAAsB;IACxC,MAAM,GAAG,GAAG,MAAM,CAAC,cAAc,CAAA;IACjC,OAAO;QACN,IAAI,EAAE,SAAS;QACf,QAAQ,EAAE;YACT,IAAI,EAAE,OAAO;YACb,WAAW,EAAE,CAAC,MAAM,CAAC,UAAW,CAAC,SAAS,EAAE,MAAM,CAAC,UAAW,CAAC,QAAQ,CAAC;SACxE;QACD,UAAU,EAAE;YACX,QAAQ,EAAE,MAAM,CAAC,EAAE;YACnB,WAAW,EAAE,MAAM,CAAC,OAAO,CAAC,MAAM;YAClC,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,SAAS,EAAE,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC1C,IAAI,EAAE,WAAW,CAAC,GAAG,CAAC;YACtB,YAAY,EAAE,GAAG,CAAC,YAAY,EAAE,SAAS,IAAI,IAAI;YACjD,OAAO,EAAE,GAAG,CAAC,OAAO,EAAE,SAAS,IAAI,IAAI;YACvC,WAAW,EAAE,GAAG,CAAC,OAAO,EAAE,OAAO,EAAE,IAAI,IAAI,IAAI;SAC/C;KACD,CAAA;AACF,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,SAAS,CAAC,QAAmC;IAC5D,OAAO;QACN,IAAI,EAAE,mBAAmB;QACzB,QAAQ,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC;KACvE,CAAA;AACF,CAAC"}
|
|
1
|
+
{"version":3,"file":"geojson.js","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAIH,0EAA0E;AAC1E,SAAS,WAAW,CAAC,MAAoB;IACxC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAA;IACxB,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAA;IACtB,MAAM,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,cAAc,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC;SAClG,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,GAAG,CAAC;SACT,IAAI,EAAE,CAAA;IACR,OAAO,MAAM,IAAI,IAAI,CAAA;AACtB,CAAC;AAED,8CAA8C;AAC9C,SAAS,SAAS,CAAC,MAAsB;IACxC,MAAM,GAAG,GAAG,MAAM,CAAC,cAAc,CAAA;IACjC,OAAO;QACN,IAAI,EAAE,SAAS;QACf,QAAQ,EAAE;YACT,IAAI,EAAE,OAAO;YACb,WAAW,EAAE,CAAC,MAAM,CAAC,UAAW,CAAC,SAAS,EAAE,MAAM,CAAC,UAAW,CAAC,QAAQ,CAAC;SACxE;QACD,UAAU,EAAE;YACX,QAAQ,EAAE,MAAM,CAAC,EAAE;YACnB,WAAW,EAAE,MAAM,CAAC,OAAO,CAAC,MAAM;YAClC,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,SAAS,EAAE,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC1C,oGAAoG;YACpG,OAAO,EAAE,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE;YACjG,IAAI,EAAE,WAAW,CAAC,GAAG,CAAC;YACtB,YAAY,EAAE,GAAG,CAAC,YAAY,EAAE,SAAS,IAAI,IAAI;YACjD,OAAO,EAAE,GAAG,CAAC,OAAO,EAAE,SAAS,IAAI,IAAI;YACvC,WAAW,EAAE,GAAG,CAAC,OAAO,EAAE,OAAO,EAAE,IAAI,IAAI,IAAI;SAC/C;KACD,CAAA;AACF,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,SAAS,CAAC,QAAmC;IAC5D,OAAO;QACN,IAAI,EAAE,mBAAmB;QACzB,QAAQ,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC;KACvE,CAAA;AACF,CAAC"}
|
package/out/index.d.ts
CHANGED
|
@@ -10,8 +10,11 @@
|
|
|
10
10
|
* {@link toGeoJSON} exports them for QGIS. This is the clinic-funding use case mailwoman was built
|
|
11
11
|
* for, finally standing on a calibrated, label-free matcher.
|
|
12
12
|
*/
|
|
13
|
+
export * from "./address-key.js";
|
|
13
14
|
export * from "./geojson.js";
|
|
14
15
|
export * from "./ingest.js";
|
|
16
|
+
export * from "./learned-scorer.js";
|
|
17
|
+
export * from "./models/dedup-gbt-en-us.js";
|
|
15
18
|
export * from "./resolve.js";
|
|
16
19
|
export * from "./types.js";
|
|
17
20
|
//# sourceMappingURL=index.d.ts.map
|
package/out/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,kBAAkB,CAAA;AAChC,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,qBAAqB,CAAA;AACnC,cAAc,6BAA6B,CAAA;AAC3C,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
|
package/out/index.js
CHANGED
|
@@ -10,8 +10,11 @@
|
|
|
10
10
|
* {@link toGeoJSON} exports them for QGIS. This is the clinic-funding use case mailwoman was built
|
|
11
11
|
* for, finally standing on a calibrated, label-free matcher.
|
|
12
12
|
*/
|
|
13
|
+
export * from "./address-key.js";
|
|
13
14
|
export * from "./geojson.js";
|
|
14
15
|
export * from "./ingest.js";
|
|
16
|
+
export * from "./learned-scorer.js";
|
|
17
|
+
export * from "./models/dedup-gbt-en-us.js";
|
|
15
18
|
export * from "./resolve.js";
|
|
16
19
|
export * from "./types.js";
|
|
17
20
|
//# sourceMappingURL=index.js.map
|
package/out/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,kBAAkB,CAAA;AAChC,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,qBAAqB,CAAA;AACnC,cAAc,6BAA6B,CAAA;AAC3C,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
|
package/out/ingest.d.ts
CHANGED
|
@@ -26,8 +26,36 @@ import { toPostalAddress } from "@mailwoman/record";
|
|
|
26
26
|
import type { SourceRecord } from "./types.js";
|
|
27
27
|
/** Resolve a raw address string into a {@link PostalAddress}. The seam to mailwoman's geocoder. */
|
|
28
28
|
export type GeocodeAddress = (raw: string) => Promise<PostalAddress | null> | PostalAddress | null;
|
|
29
|
-
/**
|
|
30
|
-
|
|
29
|
+
/** Column delimiter of a delimited source. */
|
|
30
|
+
export type Delimiter = "comma" | "tab";
|
|
31
|
+
/** Infer the delimiter from a path's extension (`.tsv` → tab, else comma). */
|
|
32
|
+
export declare function delimiterFor(path: string): Delimiter;
|
|
33
|
+
/**
|
|
34
|
+
* Stream a delimited file's rows lazily as header-keyed objects — the same shape {@link parseCsv}
|
|
35
|
+
* returns, but **without loading the file into memory**. A multi-GB source (the NPPES registry is
|
|
36
|
+
* ~4.8 GB / 9.6M rows — too big for `readFileSync`, which throws `ERR_STRING_TOO_LONG`) streams
|
|
37
|
+
* line by line. Keys are the original header names so a {@link ColumnMapping} written against the
|
|
38
|
+
* source's headers matches. Filter/sample the stream before {@link ingestRows} to keep only the rows
|
|
39
|
+
* you geocode.
|
|
40
|
+
*
|
|
41
|
+
* We stream _lines_ with spliterator's `TextSpliterator` (pure-Node, the part that handles the huge
|
|
42
|
+
* file) and split each line into columns here with `String.prototype.split`. We deliberately do NOT
|
|
43
|
+
* use `CSVSpliterator`: its column tokenizer hard-codes `skipEmpty` (it builds the column
|
|
44
|
+
* spliterator as `{ delimiter }` with no `skipEmpty: false`), so consecutive delimiters collapse
|
|
45
|
+
* and EMPTY FIELDS ARE DROPPED — fatal for a fixed-column-count registry like NPPES where a row of 330
|
|
46
|
+
* columns full of empties would mis-parse to 40 and shift every value. (Upstream `spliterator` bug;
|
|
47
|
+
* revisit when it's fixed.)
|
|
48
|
+
*
|
|
49
|
+
* Assumes an unquoted delimited file (no fields containing the delimiter) — true for these
|
|
50
|
+
* government TSVs. For small, possibly-quoted CSVs use {@link parseCsv} (quote-aware, in-memory).
|
|
51
|
+
*/
|
|
52
|
+
export declare function streamRows(source: string, opts?: {
|
|
53
|
+
delimiter?: Delimiter;
|
|
54
|
+
}): AsyncGenerator<Record<string, string>>;
|
|
55
|
+
/**
|
|
56
|
+
* Maps dataset columns to record fields. A field may draw from several columns (joined with
|
|
57
|
+
* spaces).
|
|
58
|
+
*/
|
|
31
59
|
export interface ColumnMapping {
|
|
32
60
|
/** Column holding a stable row id. Falls back to the row index. */
|
|
33
61
|
id?: string;
|
|
@@ -38,7 +66,24 @@ export interface ColumnMapping {
|
|
|
38
66
|
address?: string | string[];
|
|
39
67
|
phone?: string;
|
|
40
68
|
email?: string;
|
|
69
|
+
/**
|
|
70
|
+
* Extra secondary-identifier fields → the column(s) to draw each from (joined with spaces). Land
|
|
71
|
+
* on `SourceRecord.attributes` under the same key, for the matcher's `discriminators`
|
|
72
|
+
* (authorized-official name, taxonomy, license…).
|
|
73
|
+
*/
|
|
74
|
+
attributes?: Record<string, string | string[]>;
|
|
41
75
|
}
|
|
76
|
+
/**
|
|
77
|
+
* Best-effort {@link ColumnMapping} inferred from a header row — the "point it at any CSV"
|
|
78
|
+
* convenience. Each column name is matched (case- and punctuation-insensitive, on whole tokens) to
|
|
79
|
+
* a field by keyword, in a precedence that resolves the common ambiguities: a dedicated id / phone
|
|
80
|
+
* / email column is claimed before the generic sweep, an org / facility column beats a person
|
|
81
|
+
* "name", and address columns (street / city / state / zip…) collect into one multi-column field.
|
|
82
|
+
* Imperfect on bespoke headers (an explicit mapping or the LLM-assisted inference #603 is the
|
|
83
|
+
* answer there), but it nails tidy and semi-tidy files with no hand-mapping. Unmatched columns are
|
|
84
|
+
* left out.
|
|
85
|
+
*/
|
|
86
|
+
export declare function inferMapping(header: readonly string[]): ColumnMapping;
|
|
42
87
|
/** Options for {@link ingestRows}. */
|
|
43
88
|
export interface IngestOptions {
|
|
44
89
|
/** The geocoding seam. Without it, records carry name/org but no resolved address. */
|
|
@@ -46,8 +91,12 @@ export interface IngestOptions {
|
|
|
46
91
|
}
|
|
47
92
|
/** Parse a CSV string (with a header row) into row objects keyed by column name. */
|
|
48
93
|
export declare function parseCsv(text: string): Record<string, string>[];
|
|
49
|
-
/**
|
|
50
|
-
|
|
94
|
+
/**
|
|
95
|
+
* Normalize tabular rows into {@link SourceRecord}s under a {@link ColumnMapping}. Accepts a sync OR
|
|
96
|
+
* async iterable, so {@link parseCsv} (in-memory) and {@link streamRows} (lazy, for huge files) both
|
|
97
|
+
* thread straight through.
|
|
98
|
+
*/
|
|
99
|
+
export declare function ingestRows(rows: Iterable<Record<string, string>> | AsyncIterable<Record<string, string>>, mapping: ColumnMapping, opts?: IngestOptions): Promise<SourceRecord[]>;
|
|
51
100
|
/**
|
|
52
101
|
* The subset of mailwoman's `GeocodeResult` the adapter consumes — kept structural so this package
|
|
53
102
|
* never imports the heavy geocoder, yet a real `GeocodeResult` maps straight in.
|
package/out/ingest.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,aAAa,EAAE,MAAM,mBAAmB,CAAA;AACtE,OAAO,EAAiD,eAAe,EAAe,MAAM,mBAAmB,CAAA;
|
|
1
|
+
{"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,aAAa,EAAE,MAAM,mBAAmB,CAAA;AACtE,OAAO,EAAiD,eAAe,EAAe,MAAM,mBAAmB,CAAA;AAI/G,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C,mGAAmG;AACnG,MAAM,MAAM,cAAc,GAAG,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,GAAG,aAAa,GAAG,IAAI,CAAA;AAElG,8CAA8C;AAC9C,MAAM,MAAM,SAAS,GAAG,OAAO,GAAG,KAAK,CAAA;AAEvC,8EAA8E;AAC9E,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS,CAEpD;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAuB,UAAU,CAChC,MAAM,EAAE,MAAM,EACd,IAAI,GAAE;IAAE,SAAS,CAAC,EAAE,SAAS,CAAA;CAAO,GAClC,cAAc,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CA0BxC;AAED;;;GAGG;AACH,MAAM,WAAW,aAAa;IAC7B,mEAAmE;IACnE,EAAE,CAAC,EAAE,MAAM,CAAA;IACX,+DAA+D;IAC/D,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,IAAI,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IACxB,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IAChC,OAAO,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IAC3B,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd;;;;OAIG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC,CAAA;CAC9C;AAED;;;;;;;;;GASG;AACH,wBAAgB,YAAY,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,GAAG,aAAa,CA4CrE;AAED,sCAAsC;AACtC,MAAM,WAAW,aAAa;IAC7B,sFAAsF;IACtF,cAAc,CAAC,EAAE,cAAc,CAAA;CAC/B;AAED,oFAAoF;AACpF,wBAAgB,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAE/D;AAcD;;;;GAIG;AACH,wBAAsB,UAAU,CAC/B,IAAI,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,GAAG,aAAa,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,EAC9E,OAAO,EAAE,aAAa,EACtB,IAAI,GAAE,aAAkB,GACtB,OAAO,CAAC,YAAY,EAAE,CAAC,CAoCzB;AAED;;;GAGG;AACH,MAAM,WAAW,UAAU;IAC1B,GAAG,EAAE,MAAM,GAAG,IAAI,CAAA;IAClB,GAAG,EAAE,MAAM,GAAG,IAAI,CAAA;IAClB,eAAe,EAAE,cAAc,CAAC,MAAM,CAAC,CAAA;IACvC,aAAa,EAAE,MAAM,GAAG,IAAI,CAAA;IAC5B,SAAS,CAAC,EAAE,cAAc,CAAC,WAAW,CAAC,CAAA;CACvC;AAED;;;;;;GAMG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE;IACvC,KAAK,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,UAAU,CAAC,OAAO,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,OAAO,eAAe,CAAC,CAAC,
CAAC,CAAC,CAAA;IAC9G,OAAO,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,GAAG,UAAU,GAAG,IAAI,CAAA;IACxE,OAAO,CAAC,EAAE,MAAM,CAAA;CAChB,GAAG,cAAc,CAgBjB"}
|
package/out/ingest.js
CHANGED
|
@@ -23,6 +23,102 @@
|
|
|
23
23
|
*/
|
|
24
24
|
import { canonicalizeOrganizationName, parsePersonName, toPostalAddress, withGeocode } from "@mailwoman/record";
|
|
25
25
|
import { parse as parseCsvSync } from "csv-parse/sync";
|
|
26
|
+
import { open } from "node:fs/promises";
|
|
27
|
+
import { Delimiters, TextSpliterator } from "spliterator";
|
|
28
|
+
/**
 * Pick the column delimiter implied by a file path: a path ending in `.tsv` (any letter case) is
 * tab-delimited, everything else is treated as comma-delimited.
 *
 * @param path File path or name to inspect.
 * @returns `"tab"` or `"comma"`.
 */
export function delimiterFor(path) {
    const looksLikeTsv = path.toLowerCase().endsWith(".tsv");
    if (looksLikeTsv) {
        return "tab";
    }
    return "comma";
}
|
|
32
|
+
/**
 * Stream a delimited file's rows lazily as header-keyed objects — the same shape {@link parseCsv}
 * returns, but **without loading the file into memory**, so a multi-GB source that is too big for
 * `readFileSync` can still be read line by line.
 *
 * The first non-blank line is the header; every later non-blank line becomes one row keyed by the
 * original header names (so a {@link ColumnMapping} written against the source's headers matches).
 * Fields missing at the end of a short line are filled with `""`; fields beyond the header's length
 * are ignored. Blank lines are skipped for both LF and CRLF files, and a leading UTF-8 byte-order
 * mark is removed from the first header name.
 *
 * Lines are read with spliterator's `TextSpliterator`; each line is split into columns here with
 * `String.prototype.split`, which keeps empty fields between consecutive delimiters. The original
 * author deliberately avoided `CSVSpliterator` because its column tokenizer drops empty fields,
 * which shifts every value in a wide, sparsely populated registry (upstream `spliterator` issue —
 * revisit when fixed).
 *
 * Assumes an unquoted delimited file (no field contains the delimiter). For small, possibly-quoted
 * CSVs use {@link parseCsv} (quote-aware, in-memory).
 *
 * @param source Path of the file to read.
 * @param opts Optional settings; `opts.delimiter` overrides the delimiter otherwise inferred from
 *   the path by {@link delimiterFor}.
 * @returns An async generator yielding one `Record<string, string>` per data row.
 */
export async function* streamRows(source, opts = {}) {
    const sep = (opts.delimiter ?? delimiterFor(source)) === "tab" ? "\t" : ",";
    // Own the file handle so it is closed deterministically. Per the original author's note,
    // spliterator's `autoDispose` only fires on natural completion, not on an early
    // `break`/`.return()`, which would leak the fd. So: open it here, pass `autoDispose: false`, and
    // close it in `finally` (runs on completion AND when the consumer abandons the generator early).
    const handle = await open(source, "r");
    try {
        let header = null;
        for await (const rawLine of TextSpliterator.fromAsync(handle, {
            delimiter: Delimiters.LineFeed,
            autoDispose: false,
        })) {
            // Strip the CR of a CRLF ending BEFORE the blank-line check: a blank line in a CRLF file
            // arrives as "\r", which must be skipped rather than parsed as a one-field row/header.
            const line = rawLine.endsWith("\r") ? rawLine.slice(0, -1) : rawLine;
            if (line.length === 0)
                continue; // blank line / trailing newline
            const fields = line.split(sep);
            if (header === null) {
                // Defensive: drop a leading UTF-8 BOM in case the line reader did not, so the first
                // header name matches what a mapping expects.
                fields[0] = fields[0].replace(/^\uFEFF/, "");
                header = fields;
                continue;
            }
            const row = {};
            for (let i = 0; i < header.length; i++)
                row[header[i]] = fields[i] ?? ""; // pad short rows with empty strings
            yield row;
        }
    }
    finally {
        await handle.close();
    }
}
|
|
81
|
+
/**
 * Best-effort {@link ColumnMapping} inferred from a header row — the "point it at any CSV"
 * convenience. Each column name is lower-cased, stripped of punctuation and matched on whole tokens
 * against keyword lists, in this precedence: email, phone, id, organization, address, person name.
 *
 * - `email`, `phone`, `id` and `organization` are single-column fields: the FIRST matching column
 *   claims the field and any later column of the same category is left out. It is never handed to a
 *   lower-precedence field, so e.g. "Contact Phone" cannot end up inside `name` and "Business
 *   Phone" cannot become `organization`.
 * - Address columns (street / city / state / zip…) are collected, in header order, into one
 *   multi-column `address` field.
 * - Name columns are collected likewise; a single name column is stored as a string, several as an
 *   array.
 *
 * Imperfect on bespoke headers (an explicit mapping is the answer there); unmatched columns are
 * left out.
 *
 * @param header Column names of the source, in order.
 * @returns A mapping containing only the fields that could be inferred.
 */
export function inferMapping(header) {
    // Pad to whole-token boundaries so "state" doesn't match inside "statement".
    const tok = (h) => ` ${h
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, " ")
        .trim()} `;
    const mapping = {};
    const name = [];
    const address = [];
    for (const column of header) {
        const h = tok(column);
        const has = (...words) => words.some((w) => h.includes(` ${w} `));
        // Decide the category first, THEN claim it only if still unclaimed (`??=`). Keeping the
        // "already claimed" test out of the branch condition stops a second phone/email/id column
        // from falling through into organization / address / name.
        if (has("email", "e mail"))
            mapping.email ??= column;
        else if (has("phone", "telephone", "tel", "mobile", "cell"))
            mapping.phone ??= column;
        else if (has("id", "npi", "ein", "frn", "spin", "uuid", "guid", "key"))
            mapping.id ??= column;
        else if (has("org", "organization", "organisation", "company", "business", "facility", "agency", "employer"))
            mapping.organization ??= column;
        else if (has("street", "address", "addr", "city", "town", "state", "province", "zip", "zipcode", "postal", "postcode", "county"))
            address.push(column);
        else if (has("name", "first", "last", "given", "family", "middle", "surname", "fullname", "contact"))
            name.push(column);
    }
    if (name.length)
        mapping.name = name.length === 1 ? name[0] : name;
    if (address.length)
        mapping.address = address;
    return mapping;
}
|
|
26
122
|
/** Parse a CSV string (with a header row) into row objects keyed by column name. */
|
|
27
123
|
export function parseCsv(text) {
|
|
28
124
|
return parseCsvSync(text, { columns: true, skip_empty_lines: true, trim: true, relax_column_count: true });
|
|
@@ -39,15 +135,27 @@ function pick(row, columns) {
|
|
|
39
135
|
.trim();
|
|
40
136
|
return value || undefined;
|
|
41
137
|
}
|
|
42
|
-
/**
|
|
138
|
+
/**
|
|
139
|
+
* Normalize tabular rows into {@link SourceRecord}s under a {@link ColumnMapping}. Accepts a sync OR
|
|
140
|
+
* async iterable, so {@link parseCsv} (in-memory) and {@link streamRows} (lazy, for huge files) both
|
|
141
|
+
* thread straight through.
|
|
142
|
+
*/
|
|
43
143
|
export async function ingestRows(rows, mapping, opts = {}) {
|
|
44
144
|
const records = [];
|
|
45
145
|
let index = 0;
|
|
46
|
-
for (const row of rows) {
|
|
146
|
+
for await (const row of rows) {
|
|
47
147
|
const id = (mapping.id ? row[mapping.id]?.trim() : "") || String(index);
|
|
48
148
|
const nameValue = pick(row, mapping.name);
|
|
49
149
|
const orgValue = pick(row, mapping.organization);
|
|
50
150
|
const addressValue = pick(row, mapping.address);
|
|
151
|
+
let attributes;
|
|
152
|
+
if (mapping.attributes) {
|
|
153
|
+
for (const [key, columns] of Object.entries(mapping.attributes)) {
|
|
154
|
+
const value = pick(row, columns);
|
|
155
|
+
if (value)
|
|
156
|
+
(attributes ??= {})[key] = value;
|
|
157
|
+
}
|
|
158
|
+
}
|
|
51
159
|
const record = {
|
|
52
160
|
id,
|
|
53
161
|
source: mapping.source,
|
|
@@ -56,6 +164,7 @@ export async function ingestRows(rows, mapping, opts = {}) {
|
|
|
56
164
|
phone: (mapping.phone && row[mapping.phone]?.trim()) || undefined,
|
|
57
165
|
email: (mapping.email && row[mapping.email]?.trim()?.toLowerCase()) || undefined,
|
|
58
166
|
address: addressValue && opts.geocodeAddress ? ((await opts.geocodeAddress(addressValue)) ?? undefined) : undefined,
|
|
167
|
+
attributes,
|
|
59
168
|
raw: row,
|
|
60
169
|
};
|
|
61
170
|
records.push(record);
|
package/out/ingest.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ingest.js","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAGH,OAAO,EAAE,4BAA4B,EAAE,eAAe,EAAE,eAAe,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAA;AAC/G,OAAO,EAAE,KAAK,IAAI,YAAY,EAAE,MAAM,gBAAgB,CAAA;
|
|
1
|
+
{"version":3,"file":"ingest.js","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAGH,OAAO,EAAE,4BAA4B,EAAE,eAAe,EAAE,eAAe,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAA;AAC/G,OAAO,EAAE,KAAK,IAAI,YAAY,EAAE,MAAM,gBAAgB,CAAA;AACtD,OAAO,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAA;AACvC,OAAO,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,aAAa,CAAA;AASzD,8EAA8E;AAC9E,MAAM,UAAU,YAAY,CAAC,IAAY;IACxC,OAAO,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,CAAA;AAC9C,CAAC;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,MAAM,CAAC,KAAK,SAAS,CAAC,CAAC,UAAU,CAChC,MAAc,EACd,OAAkC,EAAE;IAEpC,MAAM,GAAG,GAAG,CAAC,IAAI,CAAC,SAAS,IAAI,YAAY,CAAC,MAAM,CAAC,CAAC,KAAK,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAA;IAC3E,kGAAkG;IAClG,qGAAqG;IACrG,mGAAmG;IACnG,iGAAiG;IACjG,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;IACtC,IAAI,CAAC;QACJ,IAAI,MAAM,GAAoB,IAAI,CAAA;QAClC,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,eAAe,CAAC,SAAS,CAAC,MAAM,EAAE;YAC1D,SAAS,EAAE,UAAU,CAAC,QAAQ;YAC9B,WAAW,EAAE,KAAK;SAClB,CAAC,EAAE,CAAC;YACJ,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;gBAAE,SAAQ,CAAC,gCAAgC;YAChE,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA,CAAC,gBAAgB;YAClE,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;gBACrB,MAAM,GAAG,MAAM,CAAA;gBACf,SAAQ;YACT,CAAC;YACD,MAAM,GAAG,GAA2B,EAAE,CAAA;YACtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE;gBAAE,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;YACzE,MAAM,GAAG,CAAA;QACV,CAAC;IACF,CAAC;YAAS,CAAC;QACV,MAAM,MAAM,CAAC,KAAK,EAAE,CAAA;IACrB,CAAC;AACF,CAAC;AAwBD;;;;;;;;;GASG;AACH,MAAM,UAAU,YAAY,CAAC,MAAyB;IACrD,6EAA6E;IAC7E,MAAM,GAAG,GAAG,CAAC,CAAS,EAAE,EAAE,CACzB,IAAI,CAAC;SACH,WAAW,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC;SAC3B,IAAI,EAAE,GAAG,CAAA;IACZ,MAAM,OAAO,GAAkB,EAAE,CAAA;IACjC,MAAM,IAAI,GAAa,EAAE,CAAA;IACzB,MAAM,OAAO,GAAa,EAAE,CAAA;IAE5B,KAAK,MAAM,MAAM,IAAI,MAAM,EAAE,CAAC;QAC7B,MAAM,CAAC,GAAG,GAAG,CAAC,MAAM,CAAC,CAAA;QACrB,MAAM,GAAG,GAAG,CAAC,GAAG,KAAe,EAAW,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,C
AAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QAEpF,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC;YAAE,OAAO,CAAC,KAAK,GAAG,MAAM,CAAA;aAC/D,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,EAAE,WAAW,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,CAAC;YAAE,OAAO,CAAC,KAAK,GAAG,MAAM,CAAA;aAChG,IAAI,CAAC,OAAO,CAAC,EAAE,IAAI,GAAG,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,CAAC;YAAE,OAAO,CAAC,EAAE,GAAG,MAAM,CAAA;aACrG,IAAI,GAAG,CAAC,KAAK,EAAE,cAAc,EAAE,cAAc,EAAE,SAAS,EAAE,UAAU,EAAE,UAAU,EAAE,QAAQ,EAAE,UAAU,CAAC;YAC3G,OAAO,CAAC,YAAY,KAAK,MAAM,CAAA;aAC3B,IACJ,GAAG,CACF,QAAQ,EACR,SAAS,EACT,MAAM,EACN,MAAM,EACN,MAAM,EACN,OAAO,EACP,UAAU,EACV,KAAK,EACL,SAAS,EACT,QAAQ,EACR,UAAU,EACV,QAAQ,CACR;YAED,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;aAChB,IAAI,GAAG,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,SAAS,EAAE,UAAU,EAAE,SAAS,CAAC;YACnG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACnB,CAAC;IAED,IAAI,IAAI,CAAC,MAAM;QAAE,OAAO,CAAC,IAAI,GAAG,IAAI,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,CAAC,CAAC,IAAI,CAAA;IACnE,IAAI,OAAO,CAAC,MAAM;QAAE,OAAO,CAAC,OAAO,GAAG,OAAO,CAAA;IAC7C,OAAO,OAAO,CAAA;AACf,CAAC;AAQD,oFAAoF;AACpF,MAAM,UAAU,QAAQ,CAAC,IAAY;IACpC,OAAO,YAAY,CAAC,IAAI,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,gBAAgB,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,kBAAkB,EAAE,IAAI,EAAE,CAAC,CAAA;AAC3G,CAAC;AAED,6FAA6F;AAC7F,SAAS,IAAI,CAAC,GAA2B,EAAE,OAA2B;IACrE,IAAI,CAAC,OAAO;QAAE,OAAO,SAAS,CAAA;IAC9B,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAA;IACzD,MAAM,KAAK,GAAG,IAAI;SAChB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;SAC1B,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,GAAG,CAAC;SACT,IAAI,EAAE,CAAA;IACR,OAAO,KAAK,IAAI,SAAS,CAAA;AAC1B,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAC/B,IAA8E,EAC9E,OAAsB,EACtB,OAAsB,EAAE;IAExB,MAAM,OAAO,GAAmB,EAAE,CAAA;IAClC,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,IAAI,KAAK,EAAE,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QAC9B,MAAM,EAAE,GAAG,CAAC,OA
AO,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,MAAM,CAAC,KAAK,CAAC,CAAA;QACvE,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,CAAC,CAAA;QACzC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,YAAY,CAAC,CAAA;QAChD,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,OAAO,CAAC,CAAA;QAE/C,IAAI,UAA8C,CAAA;QAClD,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;YACxB,KAAK,MAAM,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,CAAC;gBACjE,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,CAAA;gBAChC,IAAI,KAAK;oBAAE,CAAC,UAAU,KAAK,EAAE,CAAC,CAAC,GAAG,CAAC,GAAG,KAAK,CAAA;YAC5C,CAAC;QACF,CAAC;QAED,MAAM,MAAM,GAAiB;YAC5B,EAAE;YACF,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;YACxD,YAAY,EAAE,QAAQ,CAAC,CAAC,CAAC,4BAA4B,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS;YAC3E,KAAK,EAAE,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,CAAC,IAAI,SAAS;YACjE,KAAK,EAAE,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,EAAE,WAAW,EAAE,CAAC,IAAI,SAAS;YAChF,OAAO,EACN,YAAY,IAAI,IAAI,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,cAAc,CAAC,YAAY,CAAC,CAAC,IAAI,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;YAC3G,UAAU;YACV,GAAG,EAAE,GAAG;SACR,CAAA;QAED,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACpB,KAAK,EAAE,CAAA;IACR,CAAC;IAED,OAAO,OAAO,CAAA;AACf,CAAC;AAcD;;;;;;GAMG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAIjC;IACA,OAAO,KAAK,EAAE,GAAW,EAAiC,EAAE;QAC3D,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;QACxC,MAAM,IAAI,GAAG,eAAe,CAAC,UAAU,EAAE,EAAE,OAAO,EAAE,IAAI,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,CAAA;QAExE,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAA;QACxC,IAAI,CAAC,QAAQ,IAAI,QAAQ,CAAC,GAAG,KAAK,IAAI,IAAI,QAAQ,CAAC,GAAG,KAAK,IAAI;YAAE,OAAO,IAAI,CAAA;QAE5E,MAAM,OAAO,GAAmB;YAC/B,UAAU,EAAE,EAAE,QAAQ,EAAE,QAAQ,CAAC,GAAG,EAAE,SAAS,EAAE,QAAQ,CAAC,GAAG,EAAE;YAC/D,IAAI,EAAE,QAAQ,CAAC,eAAe;YAC9B,iBAAiB,EAAE,QAAQ,CAAC,aAAa;YACzC,SAAS,EAAE,QAAQ,CAAC,SAAS;SAC7B,CAAA;QACD,OAAO,WAAW,CAAC,IAAI,EAAE,OAAO,C
AAC,CAAA;IAClC,CAAC,CAAA;AACF,CAAC"}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* The learned scorer (#603) — the production wiring for the gradient-boosted-tree model behind
|
|
7
|
+
* {@link ResolveConfig.scorer}. Two pieces:
|
|
8
|
+
*
|
|
9
|
+
* 1. {@link createMatchFeaturizer} — the ONE feature extractor for a candidate pair, used identically
|
|
10
|
+
* at train time (`scripts/record-matcher/train-gbt.ts`), eval time (the learned-scorer
|
|
11
|
+
* evals), and inference time (here). A pair → one-hot of each comparison's agreement level +
|
|
12
|
+
* the over-merge interaction terms (co-located × name/org disagreement) + address
|
|
13
|
+
* crowdedness.
|
|
14
|
+
* 2. {@link createGbtScorer} — wraps a trained {@link GBT} + the featurizer into the `(a, b) => number`
|
|
15
|
+
* the resolve pipeline's `scorer` hook expects (a logit, threshold-comparable with the
|
|
16
|
+
* Fellegi-Sunter weight it replaces).
|
|
17
|
+
*
|
|
18
|
+
* Both take the comparison set as INPUT (rather than importing {@link buildDefaultModel}) so this
|
|
19
|
+
* module has no dependency cycle with `resolve.ts`. The contract that keeps train ≡ inference:
|
|
20
|
+
* feed the comparisons from `buildDefaultModel({ collapseSpatial: true, addressFrequency })` —
|
|
21
|
+
* the model's structure (and thus the feature layout) is fixed by that config; only the frequency
|
|
22
|
+
* VALUES differ between the training corpus and the matched set, which is the point (the model
|
|
23
|
+
* generalizes, as the cross-state eval showed).
|
|
24
|
+
*/
|
|
25
|
+
import { type Comparison, type GBT, type TermFrequencyTable } from "@mailwoman/match";
|
|
26
|
+
import type { SourceRecord } from "./types.js";
|
|
27
|
+
/** Inputs shared by the featurizer + the scorer factory. */
|
|
28
|
+
export interface LearnedFeatureConfig {
|
|
29
|
+
/**
|
|
30
|
+
* The comparison set the features are built over — MUST be `buildDefaultModel({ collapseSpatial:
|
|
31
|
+
* true, addressFrequency }).comparisons` so the feature layout matches the trained model.
|
|
32
|
+
* (`usePhone` / `discriminators` are NOT part of the learned feature model — the GBT replaces the
|
|
33
|
+
* FS weight wholesale and owns its own feature vector.)
|
|
34
|
+
*/
|
|
35
|
+
comparisons: Comparison<SourceRecord>[];
|
|
36
|
+
/** Address-frequency table for the crowdedness feature (a crowded shared address is weak
|
|
37
|
+
identity). The featurizer looks up the raw address of the FIRST record of the pair only, multiplies the returned frequency by 1000 and caps the result at 1; a record without a raw address contributes 0. */
|
|
38
|
+
addressFrequency: TermFrequencyTable;
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Build the per-pair feature extractor. The vector is: one-hot of each comparison's agreement
|
|
42
|
+
* level, then the two over-merge interaction terms (spatial-exact × name-disagree, spatial-exact ×
|
|
43
|
+
* org-disagree — the "same place, different names" signature that drives co-located over-merges),
|
|
44
|
+
* then address crowdedness scaled into [0, 1]. Deterministic and EM-independent, so it is identical
|
|
45
|
+
* across train / eval / inference.
|
|
46
|
+
*/
|
|
47
|
+
export declare function createMatchFeaturizer(config: LearnedFeatureConfig): (a: SourceRecord, b: SourceRecord) => number[];
|
|
48
|
+
/**
|
|
49
|
+
* Wrap a trained {@link GBT} into the `(a, b) => number` link scorer for
|
|
50
|
+
* {@link ResolveConfig.scorer}. The returned weight is the model's logit — same threshold-comparable
|
|
51
|
+
* units as the Fellegi-Sunter weight it replaces, so the pipeline's clustering + threshold
|
|
52
|
+
* semantics are unchanged.
|
|
53
|
+
*/
|
|
54
|
+
export declare function createGbtScorer(config: LearnedFeatureConfig & {
|
|
55
|
+
model: GBT;
|
|
56
|
+
}): (a: SourceRecord, b: SourceRecord) => number;
|
|
57
|
+
//# sourceMappingURL=learned-scorer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"learned-scorer.d.ts","sourceRoot":"","sources":["../learned-scorer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,EAAoB,KAAK,UAAU,EAAE,KAAK,GAAG,EAAY,KAAK,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AACjH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C,4DAA4D;AAC5D,MAAM,WAAW,oBAAoB;IACpC;;;;;OAKG;IACH,WAAW,EAAE,UAAU,CAAC,YAAY,CAAC,EAAE,CAAA;IACvC;aACY;IACZ,gBAAgB,EAAE,kBAAkB,CAAA;CACpC;AAED;;;;;;GAMG;AACH,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,oBAAoB,GAAG,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,KAAK,MAAM,EAAE,CAkClH;AAED;;;;;GAKG;AACH,wBAAgB,eAAe,CAC9B,MAAM,EAAE,oBAAoB,GAAG;IAAE,KAAK,EAAE,GAAG,CAAA;CAAE,GAC3C,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,KAAK,MAAM,CAI9C"}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* The learned scorer (#603) — the production wiring for the gradient-boosted-tree model behind
|
|
7
|
+
* {@link ResolveConfig.scorer}. Two pieces:
|
|
8
|
+
*
|
|
9
|
+
* 1. {@link createMatchFeaturizer} — the ONE feature extractor for a candidate pair, used identically
|
|
10
|
+
* at train time (`scripts/record-matcher/train-gbt.ts`), eval time (the learned-scorer
|
|
11
|
+
* evals), and inference time (here). A pair → one-hot of each comparison's agreement level +
|
|
12
|
+
* the over-merge interaction terms (co-located × name/org disagreement) + address
|
|
13
|
+
* crowdedness.
|
|
14
|
+
* 2. {@link createGbtScorer} — wraps a trained {@link GBT} + the featurizer into the `(a, b) => number`
|
|
15
|
+
* the resolve pipeline's `scorer` hook expects (a logit, threshold-comparable with the
|
|
16
|
+
* Fellegi-Sunter weight it replaces).
|
|
17
|
+
*
|
|
18
|
+
* Both take the comparison set as INPUT (rather than importing {@link buildDefaultModel}) so this
|
|
19
|
+
* module has no dependency cycle with `resolve.ts`. The contract that keeps train ≡ inference:
|
|
20
|
+
* feed the comparisons from `buildDefaultModel({ collapseSpatial: true, addressFrequency })` —
|
|
21
|
+
* the model's structure (and thus the feature layout) is fixed by that config; only the frequency
|
|
22
|
+
* VALUES differ between the training corpus and the matched set, which is the point (the model
|
|
23
|
+
* generalizes, as the cross-state eval showed).
|
|
24
|
+
*/
|
|
25
|
+
import { agreementPattern, gbtScore } from "@mailwoman/match";
|
|
26
|
+
/**
 * Create the feature extractor applied to every candidate pair `(a, b)`.
 *
 * Layout of the returned vector, in order:
 *
 * 1. For each comparison (in the order of `config.comparisons`), `levels.length` slots holding a
 *    one-hot encoding of the agreement level reported by `agreementPattern` for that comparison.
 * 2. Two over-merge interaction flags, each 0 or 1:
 *    - "spatial" is at level 0 AND both "given" and "family" are at their last (catch-all) level;
 *    - "spatial" is at level 0 AND "organization" is at its last (catch-all) level.
 *    A flag is 0 whenever one of the comparisons it needs is absent from `config.comparisons`.
 * 3. Address crowdedness: `addressFrequency.frequency(a.address.raw) * 1000`, capped at 1. Only
 *    record `a` is consulted; the value is 0 when `a` has no raw address.
 *
 * `comparisons` and `addressFrequency` are read from `config` once, when the featurizer is built.
 */
export function createMatchFeaturizer(config) {
    const { comparisons, addressFrequency } = config;
    // How many agreement levels each comparison has, i.e. the width of its one-hot segment.
    const segmentWidths = comparisons.map((cmp) => cmp.levels.length);
    // Comparison name -> position in the agreement pattern.
    const positionByName = Object.fromEntries(comparisons.map((cmp, pos) => [cmp.name, pos]));
    const {
        spatial: spatialPos,
        given: givenPos,
        family: familyPos,
        organization: orgPos,
    } = positionByName;
    // True when the named comparison exists and the pair landed on its final (catch-all) level.
    const atCatchAll = (pattern, pos) => pos !== undefined && pattern[pos] === segmentWidths[pos] - 1;
    return (a, b) => {
        const pattern = agreementPattern(comparisons, a, b);
        const features = [];
        // 1. One-hot segment per comparison.
        for (let pos = 0; pos < pattern.length; pos += 1) {
            const observed = pattern[pos];
            const width = segmentWidths[pos];
            for (let level = 0; level < width; level += 1) {
                features.push(observed === level ? 1 : 0);
            }
        }
        // 2. Over-merge signature: same place (spatial level 0) but names / organization disagree.
        const coLocated = spatialPos !== undefined && pattern[spatialPos] === 0;
        const namesDisagree = atCatchAll(pattern, givenPos) && atCatchAll(pattern, familyPos);
        const orgDisagrees = atCatchAll(pattern, orgPos);
        features.push(coLocated && namesDisagree ? 1 : 0);
        features.push(coLocated && orgDisagrees ? 1 : 0);
        // 3. Crowdedness of record a's address; a heavily shared address is weak identity evidence.
        let crowdedness = 0;
        if (a.address?.raw) {
            crowdedness = addressFrequency.frequency(a.address.raw);
        }
        features.push(Math.min(1, crowdedness * 1000));
        return features;
    };
}
|
|
67
|
+
/**
 * Turn a trained {@link GBT} into the `(a, b) => number` link scorer consumed by
 * {@link ResolveConfig.scorer}.
 *
 * The featurizer is built once from `config` (see {@link createMatchFeaturizer}); every call then
 * featurizes the pair and returns `gbtScore(model, features)` — the model's logit, which the
 * pipeline compares against its threshold the same way it would the Fellegi-Sunter weight.
 */
export function createGbtScorer(config) {
    const toFeatures = createMatchFeaturizer(config);
    const trainedModel = config.model;
    return (a, b) => {
        const features = toFeatures(a, b);
        return gbtScore(trainedModel, features);
    };
}
|
|
78
|
+
//# sourceMappingURL=learned-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"learned-scorer.js","sourceRoot":"","sources":["../learned-scorer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,EAAE,gBAAgB,EAA6B,QAAQ,EAA2B,MAAM,kBAAkB,CAAA;AAiBjH;;;;;;GAMG;AACH,MAAM,UAAU,qBAAqB,CAAC,MAA4B;IACjE,MAAM,EAAE,WAAW,EAAE,gBAAgB,EAAE,GAAG,MAAM,CAAA;IAChD,MAAM,WAAW,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAA;IAC3D,MAAM,KAAK,GAAG,MAAM,CAAC,WAAW,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAuC,CAAA;IAC9G,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,CAAC,CAAA;IACjC,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,CAAC,CAAA;IAC7B,MAAM,OAAO,GAAG,KAAK,CAAC,QAAQ,CAAC,CAAA;IAC/B,MAAM,IAAI,GAAG,KAAK,CAAC,cAAc,CAAC,CAAA;IAClC,MAAM,SAAS,GAAG,CAAC,CAAS,EAAU,EAAE,CAAC,WAAW,CAAC,CAAC,CAAE,GAAG,CAAC,CAAA;IAE5D,OAAO,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACf,MAAM,GAAG,GAAG,gBAAgB,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC,CAAA;QAC/C,MAAM,CAAC,GAAa,EAAE,CAAA;QACtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,MAAM,GAAG,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;YACnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,CAAC,CAAE,EAAE,CAAC,EAAE;gBAAE,CAAC,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QACpE,CAAC;QACD,8FAA8F;QAC9F,MAAM,YAAY,GAAG,QAAQ,KAAK,SAAS,IAAI,GAAG,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QAC1E,MAAM,YAAY,GACjB,MAAM,KAAK,SAAS;YACpB,OAAO,KAAK,SAAS;YACrB,GAAG,CAAC,MAAM,CAAC,KAAK,SAAS,CAAC,MAAM,CAAC;YACjC,GAAG,CAAC,OAAO,CAAC,KAAK,SAAS,CAAC,OAAO,CAAC;YAClC,CAAC,CAAC,CAAC;YACH,CAAC,CAAC,CAAC,CAAA;QACL,MAAM,WAAW,GAAG,IAAI,KAAK,SAAS,IAAI,GAAG,CAAC,IAAI,CAAC,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QAC/E,CAAC,CAAC,IAAI,CAAC,YAAY,GAAG,YAAY,CAAC,CAAA,CAAC,uDAAuD;QAC3F,CAAC,CAAC,IAAI,CAAC,YAAY,GAAG,WAAW,CAAC,CAAA;QAClC,6FAA6F;QAC7F,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAA
A;QAC3E,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC,CAAC,CAAA,CAAC,4BAA4B;QAC7D,OAAO,CAAC,CAAA;IACT,CAAC,CAAA;AACF,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,eAAe,CAC9B,MAA6C;IAE7C,MAAM,SAAS,GAAG,qBAAqB,CAAC,MAAM,CAAC,CAAA;IAC/C,MAAM,EAAE,KAAK,EAAE,GAAG,MAAM,CAAA;IACxB,OAAO,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;AAClD,CAAC"}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* GENERATED by scripts/record-matcher/train-gbt.ts — DO NOT edit by hand; retrain to update.
|
|
7
|
+
*
|
|
8
|
+
* The default learned-scorer model (#603): a gradient-boosted-tree dedup scorer trained on the
|
|
9
|
+
* NPPES NPI-truth set (TX, 3000 NPIs → 707975 candidate pairs). Validated to generalize across
|
|
10
|
+
* states by learned-scorer-crossstate-eval.ts. Used by resolveEntities' opt-in learnedScorer hook
|
|
11
|
+
* via createGbtScorer. The trained {@link GBT} is plain data.
|
|
12
|
+
*/
|
|
13
|
+
import type { GBT } from "@mailwoman/match";
|
|
14
|
+
/** Provenance for the bundled model — what it was trained on. */
|
|
15
|
+
export declare const DEDUP_GBT_META: {
|
|
16
|
+
readonly version: "1.0.0";
|
|
17
|
+
readonly locale: "en-US";
|
|
18
|
+
readonly trainedOn: "2026-06-15";
|
|
19
|
+
readonly state: "TX"; // state of the NPPES NPI-truth training set
|
|
20
|
+
readonly npis: 3000; // number of NPIs in the training set
|
|
21
|
+
readonly records: 8602;
|
|
22
|
+
readonly pairs: 707975; // candidate pairs generated from those NPIs
|
|
23
|
+
readonly posRate: 0.0088; // NOTE(review): presumably the fraction of candidate pairs labelled as matches — confirm
|
|
24
|
+
readonly hyperparams: {
|
|
25
|
+
readonly rounds: 120;
|
|
26
|
+
readonly depth: 3;
|
|
27
|
+
readonly lr: 0.3;
|
|
28
|
+
readonly minLeaf: 20;
|
|
29
|
+
};
|
|
30
|
+
readonly recommendedThreshold: 2.7143; // NOTE(review): presumably a cut-off for the scorer's logit output — confirm against the training script
|
|
31
|
+
readonly features: 17; // NOTE(review): presumably the length of the feature vector the model expects — confirm it matches createMatchFeaturizer
|
|
32
|
+
readonly addressFrequencyDistinct: 3317267;
|
|
33
|
+
readonly addressFrequencyTotal: 9260504;
|
|
34
|
+
};
|
|
35
|
+
export declare const DEDUP_GBT_MODEL: GBT;
|
|
36
|
+
//# sourceMappingURL=dedup-gbt-en-us.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dedup-gbt-en-us.d.ts","sourceRoot":"","sources":["../../models/dedup-gbt-en-us.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,KAAK,EAAE,GAAG,EAAE,MAAM,kBAAkB,CAAA;AAE3C,iEAAiE;AACjE,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;CAmBjB,CAAA;AAGV,eAAO,MAAM,eAAe,EAAE,GAAo0vD,CAAA"}
|