@mailwoman/registry 4.8.1 → 4.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * The address-id consumer for the matcher (#259). Derives a stable {@link PostalAddressID} from a
7
+ * resolved {@link SourceRecord} and exposes it as a blocking key — the deterministic,
8
+ * exact-canonical-address complement to the fuzzy Fellegi-Sunter / GBT scoring. Two uses:
9
+ *
10
+ * - **As a pre-dedup / join key:** `GROUP BY postalAddressId(record)` collapses records that resolve
11
+ * to the same place AND share a canonical address with NO scoring at all — the cheap, certain
12
+ * slice of dedup before the matcher does the fuzzy rest.
13
+ * - **As a blocking key:** {@link addressIdBlockingKey} adds the address-id to the blocking union, so
14
+ * records sharing one are guaranteed to be compared.
15
+ */
16
+ import { type PostalAddressID } from "@mailwoman/address-id";
17
+ import { type BlockingKey } from "@mailwoman/match";
18
+ import type { SourceRecord } from "./types.js";
19
+ /**
20
+ * The stable address primary key for a record, or null when it isn't geocoded (no coordinate → no
21
+ * locality cell) or carries no raw address to hash. Uses the resolved coordinate + the raw address;
22
+ * the state prefix is plucked from the address when present.
23
+ */
24
+ export declare function postalAddressId(record: SourceRecord): PostalAddressID | null;
25
+ /**
26
+ * A blocking key on the {@link postalAddressId} — records that resolve to the same place with the
27
+ * same canonical address block together. Add it to {@link defaultBlockingKeys}'s union when an exact
28
+ * address join should never be missed.
29
+ */
30
+ export declare function addressIdBlockingKey(): BlockingKey<SourceRecord>;
31
+ //# sourceMappingURL=address-key.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"address-key.d.ts","sourceRoot":"","sources":["../address-key.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,EAAyB,KAAK,eAAe,EAAE,MAAM,uBAAuB,CAAA;AACnF,OAAO,EAAE,KAAK,WAAW,EAAY,MAAM,kBAAkB,CAAA;AAC7D,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,MAAM,EAAE,YAAY,GAAG,eAAe,GAAG,IAAI,CAK5E;AAED;;;;GAIG;AACH,wBAAgB,oBAAoB,IAAI,WAAW,CAAC,YAAY,CAAC,CAEhE"}
@@ -0,0 +1,38 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * The address-id consumer for the matcher (#259). Derives a stable {@link PostalAddressID} from a
7
+ * resolved {@link SourceRecord} and exposes it as a blocking key — the deterministic,
8
+ * exact-canonical-address complement to the fuzzy Fellegi-Sunter / GBT scoring. Two uses:
9
+ *
10
+ * - **As a pre-dedup / join key:** `GROUP BY postalAddressId(record)` collapses records that resolve
11
+ * to the same place AND share a canonical address with NO scoring at all — the cheap, certain
12
+ * slice of dedup before the matcher does the fuzzy rest.
13
+ * - **As a blocking key:** {@link addressIdBlockingKey} adds the address-id to the blocking union, so
14
+ * records sharing one are guaranteed to be compared.
15
+ */
16
+ import { createPostalAddressID } from "@mailwoman/address-id";
17
+ import { exactKey } from "@mailwoman/match";
18
/**
 * Derive the stable address primary key for a record.
 *
 * Reads the resolved coordinate (`record.address.geocode.coordinate`) and the raw address string
 * (`record.address.raw`) and hands both to `createPostalAddressID`.
 *
 * @param record The source record to key.
 * @returns The postal address id, or `null` when the record has no coordinate (not geocoded) or no
 *   raw address to hash.
 */
export function postalAddressId(record) {
	const resolved = record.address;
	const coordinate = resolved?.geocode?.coordinate;
	const address = resolved?.raw;
	// Both inputs are required; an empty raw string counts as "no address".
	if (coordinate && address) {
		return createPostalAddressID({ coordinate, address });
	}
	return null;
}
30
/**
 * Build a blocking key over {@link postalAddressId}: the key extractor passed to `exactKey` returns
 * each record's postal address id (or `null` when the record has none). Add it to the blocking
 * union when an exact address join should never be missed.
 *
 * @returns The blocking key produced by `exactKey` for the address-id extractor.
 */
export function addressIdBlockingKey() {
	const addressIdOf = (record) => postalAddressId(record);
	return exactKey(addressIdOf);
}
38
+ //# sourceMappingURL=address-key.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"address-key.js","sourceRoot":"","sources":["../address-key.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,EAAE,qBAAqB,EAAwB,MAAM,uBAAuB,CAAA;AACnF,OAAO,EAAoB,QAAQ,EAAE,MAAM,kBAAkB,CAAA;AAG7D;;;;GAIG;AACH,MAAM,UAAU,eAAe,CAAC,MAAoB;IACnD,MAAM,UAAU,GAAG,MAAM,CAAC,OAAO,EAAE,OAAO,EAAE,UAAU,CAAA;IACtD,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,EAAE,GAAG,CAAA;IACnC,IAAI,CAAC,UAAU,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IACxC,OAAO,qBAAqB,CAAC,EAAE,UAAU,EAAE,OAAO,EAAE,CAAC,CAAA;AACtD,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,oBAAoB;IACnC,OAAO,QAAQ,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,CAAA;AACrD,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"geojson.d.ts","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,KAAK,EAAkB,wBAAwB,EAAE,cAAc,EAAgB,MAAM,YAAY,CAAA;AAmCxG;;;GAGG;AACH,wBAAgB,SAAS,CAAC,QAAQ,EAAE,SAAS,cAAc,EAAE,GAAG,wBAAwB,CAKvF"}
1
+ {"version":3,"file":"geojson.d.ts","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,KAAK,EAAkB,wBAAwB,EAAE,cAAc,EAAgB,MAAM,YAAY,CAAA;AAqCxG;;;GAGG;AACH,wBAAgB,SAAS,CAAC,QAAQ,EAAE,SAAS,cAAc,EAAE,GAAG,wBAAwB,CAKvF"}
package/out/geojson.js CHANGED
@@ -34,6 +34,8 @@ function toFeature(entity) {
34
34
  recordCount: entity.records.length,
35
35
  cohesion: entity.cohesion,
36
36
  sourceIds: entity.records.map((r) => r.id),
37
+ // Distinct provenance labels the entity's records span — an entity with ≥2 is a cross-dataset link.
38
+ sources: [...new Set(entity.records.map((r) => r.source).filter((s) => !!s))].sort(),
37
39
  name: displayName(rep),
38
40
  organization: rep.organization?.canonical ?? null,
39
41
  address: rep.address?.formatted ?? null,
@@ -1 +1 @@
1
- {"version":3,"file":"geojson.js","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAIH,0EAA0E;AAC1E,SAAS,WAAW,CAAC,MAAoB;IACxC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAA;IACxB,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAA;IACtB,MAAM,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,cAAc,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC;SAClG,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,GAAG,CAAC;SACT,IAAI,EAAE,CAAA;IACR,OAAO,MAAM,IAAI,IAAI,CAAA;AACtB,CAAC;AAED,8CAA8C;AAC9C,SAAS,SAAS,CAAC,MAAsB;IACxC,MAAM,GAAG,GAAG,MAAM,CAAC,cAAc,CAAA;IACjC,OAAO;QACN,IAAI,EAAE,SAAS;QACf,QAAQ,EAAE;YACT,IAAI,EAAE,OAAO;YACb,WAAW,EAAE,CAAC,MAAM,CAAC,UAAW,CAAC,SAAS,EAAE,MAAM,CAAC,UAAW,CAAC,QAAQ,CAAC;SACxE;QACD,UAAU,EAAE;YACX,QAAQ,EAAE,MAAM,CAAC,EAAE;YACnB,WAAW,EAAE,MAAM,CAAC,OAAO,CAAC,MAAM;YAClC,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,SAAS,EAAE,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC1C,IAAI,EAAE,WAAW,CAAC,GAAG,CAAC;YACtB,YAAY,EAAE,GAAG,CAAC,YAAY,EAAE,SAAS,IAAI,IAAI;YACjD,OAAO,EAAE,GAAG,CAAC,OAAO,EAAE,SAAS,IAAI,IAAI;YACvC,WAAW,EAAE,GAAG,CAAC,OAAO,EAAE,OAAO,EAAE,IAAI,IAAI,IAAI;SAC/C;KACD,CAAA;AACF,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,SAAS,CAAC,QAAmC;IAC5D,OAAO;QACN,IAAI,EAAE,mBAAmB;QACzB,QAAQ,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC;KACvE,CAAA;AACF,CAAC"}
1
+ {"version":3,"file":"geojson.js","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAIH,0EAA0E;AAC1E,SAAS,WAAW,CAAC,MAAoB;IACxC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAA;IACxB,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAA;IACtB,MAAM,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,cAAc,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC;SAClG,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,GAAG,CAAC;SACT,IAAI,EAAE,CAAA;IACR,OAAO,MAAM,IAAI,IAAI,CAAA;AACtB,CAAC;AAED,8CAA8C;AAC9C,SAAS,SAAS,CAAC,MAAsB;IACxC,MAAM,GAAG,GAAG,MAAM,CAAC,cAAc,CAAA;IACjC,OAAO;QACN,IAAI,EAAE,SAAS;QACf,QAAQ,EAAE;YACT,IAAI,EAAE,OAAO;YACb,WAAW,EAAE,CAAC,MAAM,CAAC,UAAW,CAAC,SAAS,EAAE,MAAM,CAAC,UAAW,CAAC,QAAQ,CAAC;SACxE;QACD,UAAU,EAAE;YACX,QAAQ,EAAE,MAAM,CAAC,EAAE;YACnB,WAAW,EAAE,MAAM,CAAC,OAAO,CAAC,MAAM;YAClC,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,SAAS,EAAE,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC1C,oGAAoG;YACpG,OAAO,EAAE,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE;YACjG,IAAI,EAAE,WAAW,CAAC,GAAG,CAAC;YACtB,YAAY,EAAE,GAAG,CAAC,YAAY,EAAE,SAAS,IAAI,IAAI;YACjD,OAAO,EAAE,GAAG,CAAC,OAAO,EAAE,SAAS,IAAI,IAAI;YACvC,WAAW,EAAE,GAAG,CAAC,OAAO,EAAE,OAAO,EAAE,IAAI,IAAI,IAAI;SAC/C;KACD,CAAA;AACF,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,SAAS,CAAC,QAAmC;IAC5D,OAAO;QACN,IAAI,EAAE,mBAAmB;QACzB,QAAQ,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC;KACvE,CAAA;AACF,CAAC"}
package/out/index.d.ts CHANGED
@@ -10,8 +10,11 @@
10
10
  * {@link toGeoJSON} exports them for QGIS. This is the clinic-funding use case mailwoman was built
11
11
  * for, finally standing on a calibrated, label-free matcher.
12
12
  */
13
+ export * from "./address-key.js";
13
14
  export * from "./geojson.js";
14
15
  export * from "./ingest.js";
16
+ export * from "./learned-scorer.js";
17
+ export * from "./models/dedup-gbt-en-us.js";
15
18
  export * from "./resolve.js";
16
19
  export * from "./types.js";
17
20
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,kBAAkB,CAAA;AAChC,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,qBAAqB,CAAA;AACnC,cAAc,6BAA6B,CAAA;AAC3C,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
package/out/index.js CHANGED
@@ -10,8 +10,11 @@
10
10
  * {@link toGeoJSON} exports them for QGIS. This is the clinic-funding use case mailwoman was built
11
11
  * for, finally standing on a calibrated, label-free matcher.
12
12
  */
13
+ export * from "./address-key.js";
13
14
  export * from "./geojson.js";
14
15
  export * from "./ingest.js";
16
+ export * from "./learned-scorer.js";
17
+ export * from "./models/dedup-gbt-en-us.js";
15
18
  export * from "./resolve.js";
16
19
  export * from "./types.js";
17
20
  //# sourceMappingURL=index.js.map
package/out/index.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,kBAAkB,CAAA;AAChC,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,qBAAqB,CAAA;AACnC,cAAc,6BAA6B,CAAA;AAC3C,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
package/out/ingest.d.ts CHANGED
@@ -26,8 +26,36 @@ import { toPostalAddress } from "@mailwoman/record";
26
26
  import type { SourceRecord } from "./types.js";
27
27
  /** Resolve a raw address string into a {@link PostalAddress}. The seam to mailwoman's geocoder. */
28
28
  export type GeocodeAddress = (raw: string) => Promise<PostalAddress | null> | PostalAddress | null;
29
- /** Maps dataset columns to record fields. A field may draw from several columns (joined with
30
- spaces). */
29
+ /** Column delimiter of a delimited source. */
30
+ export type Delimiter = "comma" | "tab";
31
+ /** Infer the delimiter from a path's extension (`.tsv` → tab, else comma). */
32
+ export declare function delimiterFor(path: string): Delimiter;
33
+ /**
34
+ * Stream a delimited file's rows lazily as header-keyed objects — the same shape {@link parseCsv}
35
+ * returns, but **without loading the file into memory**. A multi-GB source (the NPPES registry is
36
+ * ~4.8 GB / 9.6M rows — too big for `readFileSync`, which throws `ERR_STRING_TOO_LONG`) streams
37
+ * line by line. Keys are the original header names so a {@link ColumnMapping} written against the
38
+ * source's headers matches. Filter/sample the stream before {@link ingestRows} to keep only the rows
39
+ * you geocode.
40
+ *
41
+ * We stream _lines_ with spliterator's `TextSpliterator` (pure-Node, the part that handles the huge
42
+ * file) and split each line into columns here with `String.prototype.split`. We deliberately do NOT
43
+ * use `CSVSpliterator`: its column tokenizer hard-codes `skipEmpty` (it builds the column
44
+ * spliterator as `{ delimiter }` with no `skipEmpty: false`), so consecutive delimiters collapse
45
+ * and EMPTY FIELDS ARE DROPPED — fatal for a fixed-width registry like NPPES where a row of 330
46
+ * columns full of empties would mis-parse to 40 and shift every value. (Upstream `spliterator` bug;
47
+ * revisit when it's fixed.)
48
+ *
49
+ * Assumes an unquoted delimited file (no fields containing the delimiter) — true for these
50
+ * government TSVs. For small, possibly-quoted CSVs use {@link parseCsv} (quote-aware, in-memory).
51
+ */
52
+ export declare function streamRows(source: string, opts?: {
53
+ delimiter?: Delimiter;
54
+ }): AsyncGenerator<Record<string, string>>;
55
+ /**
56
+ * Maps dataset columns to record fields. A field may draw from several columns (joined with
57
+ * spaces).
58
+ */
31
59
  export interface ColumnMapping {
32
60
  /** Column holding a stable row id. Falls back to the row index. */
33
61
  id?: string;
@@ -38,7 +66,24 @@ export interface ColumnMapping {
38
66
  address?: string | string[];
39
67
  phone?: string;
40
68
  email?: string;
69
+ /**
70
+ * Extra secondary-identifier fields → the column(s) to draw each from (joined with spaces). Land
71
+ * on `SourceRecord.attributes` under the same key, for the matcher's `discriminators`
72
+ * (authorized-official name, taxonomy, license…).
73
+ */
74
+ attributes?: Record<string, string | string[]>;
41
75
  }
76
+ /**
77
+ * Best-effort {@link ColumnMapping} inferred from a header row — the "point it at any CSV"
78
+ * convenience. Each column name is matched (case- and punctuation-insensitive, on whole tokens) to
79
+ * a field by keyword, in a precedence that resolves the common ambiguities: a dedicated id / phone
80
+ * / email column is claimed before the generic sweep, an org / facility column beats a person
81
+ * "name", and address columns (street / city / state / zip…) collect into one multi-column field.
82
+ * Imperfect on bespoke headers (an explicit mapping or the LLM-assisted inference #603 is the
83
+ * answer there), but it nails tidy and semi-tidy files with no hand-mapping. Unmatched columns are
84
+ * left out.
85
+ */
86
+ export declare function inferMapping(header: readonly string[]): ColumnMapping;
42
87
  /** Options for {@link ingestRows}. */
43
88
  export interface IngestOptions {
44
89
  /** The geocoding seam. Without it, records carry name/org but no resolved address. */
@@ -46,8 +91,12 @@ export interface IngestOptions {
46
91
  }
47
92
  /** Parse a CSV string (with a header row) into row objects keyed by column name. */
48
93
  export declare function parseCsv(text: string): Record<string, string>[];
49
- /** Normalize tabular rows into {@link SourceRecord}s under a {@link ColumnMapping}. */
50
- export declare function ingestRows(rows: Iterable<Record<string, string>>, mapping: ColumnMapping, opts?: IngestOptions): Promise<SourceRecord[]>;
94
+ /**
95
+ * Normalize tabular rows into {@link SourceRecord}s under a {@link ColumnMapping}. Accepts a sync OR
96
+ * async iterable, so {@link parseCsv} (in-memory) and {@link streamRows} (lazy, for huge files) both
97
+ * thread straight through.
98
+ */
99
+ export declare function ingestRows(rows: Iterable<Record<string, string>> | AsyncIterable<Record<string, string>>, mapping: ColumnMapping, opts?: IngestOptions): Promise<SourceRecord[]>;
51
100
  /**
52
101
  * The subset of mailwoman's `GeocodeResult` the adapter consumes — kept structural so this package
53
102
  * never imports the heavy geocoder, yet a real `GeocodeResult` maps straight in.
@@ -1 +1 @@
1
- {"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,aAAa,EAAE,MAAM,mBAAmB,CAAA;AACtE,OAAO,EAAiD,eAAe,EAAe,MAAM,mBAAmB,CAAA;AAE/G,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C,mGAAmG;AACnG,MAAM,MAAM,cAAc,GAAG,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,GAAG,aAAa,GAAG,IAAI,CAAA;AAElG;WACW;AACX,MAAM,WAAW,aAAa;IAC7B,mEAAmE;IACnE,EAAE,CAAC,EAAE,MAAM,CAAA;IACX,+DAA+D;IAC/D,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,IAAI,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IACxB,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IAChC,OAAO,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IAC3B,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;CACd;AAED,sCAAsC;AACtC,MAAM,WAAW,aAAa;IAC7B,sFAAsF;IACtF,cAAc,CAAC,EAAE,cAAc,CAAA;CAC/B;AAED,oFAAoF;AACpF,wBAAgB,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAE/D;AAcD,uFAAuF;AACvF,wBAAsB,UAAU,CAC/B,IAAI,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,EACtC,OAAO,EAAE,aAAa,EACtB,IAAI,GAAE,aAAkB,GACtB,OAAO,CAAC,YAAY,EAAE,CAAC,CA2BzB;AAED;;;GAGG;AACH,MAAM,WAAW,UAAU;IAC1B,GAAG,EAAE,MAAM,GAAG,IAAI,CAAA;IAClB,GAAG,EAAE,MAAM,GAAG,IAAI,CAAA;IAClB,eAAe,EAAE,cAAc,CAAC,MAAM,CAAC,CAAA;IACvC,aAAa,EAAE,MAAM,GAAG,IAAI,CAAA;IAC5B,SAAS,CAAC,EAAE,cAAc,CAAC,WAAW,CAAC,CAAA;CACvC;AAED;;;;;;GAMG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE;IACvC,KAAK,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,UAAU,CAAC,OAAO,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,OAAO,eAAe,CAAC,CAAC,CAAC,CAAC,CAAA;IAC9G,OAAO,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,GAAG,UAAU,GAAG,IAAI,CAAA;IACxE,OAAO,CAAC,EAAE,MAAM,CAAA;CAChB,GAAG,cAAc,CAgBjB"}
1
+ {"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,aAAa,EAAE,MAAM,mBAAmB,CAAA;AACtE,OAAO,EAAiD,eAAe,EAAe,MAAM,mBAAmB,CAAA;AAI/G,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C,mGAAmG;AACnG,MAAM,MAAM,cAAc,GAAG,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,GAAG,aAAa,GAAG,IAAI,CAAA;AAElG,8CAA8C;AAC9C,MAAM,MAAM,SAAS,GAAG,OAAO,GAAG,KAAK,CAAA;AAEvC,8EAA8E;AAC9E,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS,CAEpD;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAuB,UAAU,CAChC,MAAM,EAAE,MAAM,EACd,IAAI,GAAE;IAAE,SAAS,CAAC,EAAE,SAAS,CAAA;CAAO,GAClC,cAAc,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CA0BxC;AAED;;;GAGG;AACH,MAAM,WAAW,aAAa;IAC7B,mEAAmE;IACnE,EAAE,CAAC,EAAE,MAAM,CAAA;IACX,+DAA+D;IAC/D,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,IAAI,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IACxB,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IAChC,OAAO,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IAC3B,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd;;;;OAIG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC,CAAA;CAC9C;AAED;;;;;;;;;GASG;AACH,wBAAgB,YAAY,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,GAAG,aAAa,CA4CrE;AAED,sCAAsC;AACtC,MAAM,WAAW,aAAa;IAC7B,sFAAsF;IACtF,cAAc,CAAC,EAAE,cAAc,CAAA;CAC/B;AAED,oFAAoF;AACpF,wBAAgB,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAE/D;AAcD;;;;GAIG;AACH,wBAAsB,UAAU,CAC/B,IAAI,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,GAAG,aAAa,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,EAC9E,OAAO,EAAE,aAAa,EACtB,IAAI,GAAE,aAAkB,GACtB,OAAO,CAAC,YAAY,EAAE,CAAC,CAoCzB;AAED;;;GAGG;AACH,MAAM,WAAW,UAAU;IAC1B,GAAG,EAAE,MAAM,GAAG,IAAI,CAAA;IAClB,GAAG,EAAE,MAAM,GAAG,IAAI,CAAA;IAClB,eAAe,EAAE,cAAc,CAAC,MAAM,CAAC,CAAA;IACvC,aAAa,EAAE,MAAM,GAAG,IAAI,CAAA;IAC5B,SAAS,CAAC,EAAE,cAAc,CAAC,WAAW,CAAC,CAAA;CACvC;AAED;;;;;;GAMG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE;IACvC,KAAK,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,UAAU,CAAC,OAAO,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,OAAO,eAAe,CAAC,CAA
C,CAAC,CAAC,CAAA;IAC9G,OAAO,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,GAAG,UAAU,GAAG,IAAI,CAAA;IACxE,OAAO,CAAC,EAAE,MAAM,CAAA;CAChB,GAAG,cAAc,CAgBjB"}
package/out/ingest.js CHANGED
@@ -23,6 +23,102 @@
23
23
  */
24
24
  import { canonicalizeOrganizationName, parsePersonName, toPostalAddress, withGeocode } from "@mailwoman/record";
25
25
  import { parse as parseCsvSync } from "csv-parse/sync";
26
+ import { open } from "node:fs/promises";
27
+ import { Delimiters, TextSpliterator } from "spliterator";
28
/**
 * Pick the column delimiter implied by a file path.
 *
 * @param path File path or name; only a trailing `.tsv` extension (any letter case) is significant.
 * @returns `"tab"` for a `.tsv` path, `"comma"` for everything else.
 */
export function delimiterFor(path) {
	const looksLikeTsv = /\.tsv$/i.test(path);
	if (looksLikeTsv) {
		return "tab";
	}
	return "comma";
}
32
/**
 * Stream a delimited file's rows lazily as header-keyed objects — the same shape {@link parseCsv}
 * returns, but **without loading the file into memory**. A multi-GB source (the NPPES registry is
 * ~4.8 GB / 9.6M rows — too big for `readFileSync`, which throws `ERR_STRING_TOO_LONG`) streams
 * line by line. Keys are the original header names so a {@link ColumnMapping} written against the
 * source's headers matches. Filter/sample the stream before {@link ingestRows} to keep only the rows
 * you geocode.
 *
 * We stream _lines_ with spliterator's `TextSpliterator` and split each line into columns here with
 * `String.prototype.split`. We deliberately do NOT use `CSVSpliterator`: its column tokenizer
 * hard-codes `skipEmpty`, so consecutive delimiters collapse and EMPTY FIELDS ARE DROPPED — fatal
 * for a wide, sparsely-populated registry like NPPES where a row of 330 columns full of empties
 * would mis-parse to 40 and shift every value. (Upstream `spliterator` bug; revisit when it's
 * fixed.)
 *
 * Assumes an unquoted delimited file (no fields containing the delimiter). For small,
 * possibly-quoted CSVs use {@link parseCsv} (quote-aware, in-memory).
 *
 * Row shaping: the first non-blank line is the header. Every later non-blank line yields one object
 * with exactly the header's keys; missing trailing fields become `""` and surplus fields are
 * ignored. Blank lines are skipped, including blank lines in CRLF files.
 *
 * @param source Path of the file to read.
 * @param opts Optional settings. `opts.delimiter` (`"comma"` | `"tab"`) overrides the delimiter
 *   otherwise inferred from the path by {@link delimiterFor}.
 * @returns An async generator of header-keyed row objects.
 */
export async function* streamRows(source, opts = {}) {
	const sep = (opts.delimiter ?? delimiterFor(source)) === "tab" ? "\t" : ",";
	// Own the file handle so it's closed deterministically. spliterator's `autoDispose` only fires on
	// natural completion, not on an early `break`/`.return()` — which then leaks the fd (a GC-time error
	// in Node 24+). We open it, pass `autoDispose: false` so spliterator never touches our handle, and
	// close it in `finally` (runs on completion AND when the consumer abandons the generator early).
	const handle = await open(source, "r");
	try {
		let header = null;
		for await (const rawLine of TextSpliterator.fromAsync(handle, {
			delimiter: Delimiters.LineFeed,
			autoDispose: false,
		})) {
			// Strip the CR of a CRLF ending BEFORE the blank check: a blank line in a CRLF file arrives
			// as "\r" (length 1) and would otherwise slip through as a bogus header or an all-empty row.
			const line = rawLine.replace(/\r$/, "");
			if (line.length === 0) continue; // blank line / trailing newline
			const fields = line.split(sep);
			if (header === null) {
				header = fields;
				continue;
			}
			const row = {};
			for (let i = 0; i < header.length; i++) row[header[i]] = fields[i] ?? "";
			yield row;
		}
	} finally {
		await handle.close();
	}
}
81
/**
 * Best-effort {@link ColumnMapping} inferred from a header row — the "point it at any CSV"
 * convenience. Each column name is lower-cased, has every run of non-alphanumerics collapsed to a
 * single space, and is then matched on whole tokens against keyword lists, in this precedence:
 *
 * 1. email, 2. phone, 3. id — single-column fields; the FIRST matching column wins.
 * 2. organization — single-column field; the first matching column wins.
 * 3. address (street / city / state / zip…) — every matching column is collected, in header order.
 * 4. name (first / last / full name…) — every matching column is collected; a lone match is stored
 *    as a plain string instead of a one-element array.
 *
 * A column is consumed by the first category whose keywords it matches, even when that category
 * is already filled. A second "Email Address" column is therefore dropped rather than falling
 * through to the address sweep (and a "Contact Phone" column never lands in `name`).
 *
 * Imperfect on bespoke headers (an explicit mapping is the answer there). Unmatched columns are
 * left out.
 *
 * @param header The column names, in file order.
 * @returns A mapping containing only the fields that were matched.
 */
export function inferMapping(header) {
	// Pad to whole-token boundaries so "state" doesn't match inside "statement".
	const tokenize = (h) =>
		` ${h
			.toLowerCase()
			.replace(/[^a-z0-9]+/g, " ")
			.trim()} `;
	const mapping = {};
	const name = [];
	const address = [];
	for (const column of header) {
		const h = tokenize(column);
		const has = (...words) => words.some((w) => h.includes(` ${w} `));
		// Dedicated single-column fields: a keyword hit claims the column outright. `??=` keeps the
		// first column seen and discards later duplicates instead of letting them leak into the
		// organization / address / name branches below.
		if (has("email", "e mail")) mapping.email ??= column;
		else if (has("phone", "telephone", "tel", "mobile", "cell")) mapping.phone ??= column;
		else if (has("id", "npi", "ein", "frn", "spin", "uuid", "guid", "key")) mapping.id ??= column;
		else if (has("org", "organization", "organisation", "company", "business", "facility", "agency", "employer"))
			mapping.organization ??= column;
		else if (
			has("street", "address", "addr", "city", "town", "state", "province", "zip", "zipcode", "postal", "postcode", "county")
		)
			address.push(column);
		else if (has("name", "first", "last", "given", "family", "middle", "surname", "fullname", "contact"))
			name.push(column);
	}
	if (name.length) mapping.name = name.length === 1 ? name[0] : name;
	if (address.length) mapping.address = address;
	return mapping;
}
26
122
  /** Parse a CSV string (with a header row) into row objects keyed by column name. */
27
123
  export function parseCsv(text) {
28
124
  return parseCsvSync(text, { columns: true, skip_empty_lines: true, trim: true, relax_column_count: true });
@@ -39,15 +135,27 @@ function pick(row, columns) {
39
135
  .trim();
40
136
  return value || undefined;
41
137
  }
42
- /** Normalize tabular rows into {@link SourceRecord}s under a {@link ColumnMapping}. */
138
+ /**
139
+ * Normalize tabular rows into {@link SourceRecord}s under a {@link ColumnMapping}. Accepts a sync OR
140
+ * async iterable, so {@link parseCsv} (in-memory) and {@link streamRows} (lazy, for huge files) both
141
+ * thread straight through.
142
+ */
43
143
  export async function ingestRows(rows, mapping, opts = {}) {
44
144
  const records = [];
45
145
  let index = 0;
46
- for (const row of rows) {
146
+ for await (const row of rows) {
47
147
  const id = (mapping.id ? row[mapping.id]?.trim() : "") || String(index);
48
148
  const nameValue = pick(row, mapping.name);
49
149
  const orgValue = pick(row, mapping.organization);
50
150
  const addressValue = pick(row, mapping.address);
151
+ let attributes;
152
+ if (mapping.attributes) {
153
+ for (const [key, columns] of Object.entries(mapping.attributes)) {
154
+ const value = pick(row, columns);
155
+ if (value)
156
+ (attributes ??= {})[key] = value;
157
+ }
158
+ }
51
159
  const record = {
52
160
  id,
53
161
  source: mapping.source,
@@ -56,6 +164,7 @@ export async function ingestRows(rows, mapping, opts = {}) {
56
164
  phone: (mapping.phone && row[mapping.phone]?.trim()) || undefined,
57
165
  email: (mapping.email && row[mapping.email]?.trim()?.toLowerCase()) || undefined,
58
166
  address: addressValue && opts.geocodeAddress ? ((await opts.geocodeAddress(addressValue)) ?? undefined) : undefined,
167
+ attributes,
59
168
  raw: row,
60
169
  };
61
170
  records.push(record);
package/out/ingest.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"ingest.js","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAGH,OAAO,EAAE,4BAA4B,EAAE,eAAe,EAAE,eAAe,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAA;AAC/G,OAAO,EAAE,KAAK,IAAI,YAAY,EAAE,MAAM,gBAAgB,CAAA;AA0BtD,oFAAoF;AACpF,MAAM,UAAU,QAAQ,CAAC,IAAY;IACpC,OAAO,YAAY,CAAC,IAAI,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,gBAAgB,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,kBAAkB,EAAE,IAAI,EAAE,CAAC,CAAA;AAC3G,CAAC;AAED,6FAA6F;AAC7F,SAAS,IAAI,CAAC,GAA2B,EAAE,OAA2B;IACrE,IAAI,CAAC,OAAO;QAAE,OAAO,SAAS,CAAA;IAC9B,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAA;IACzD,MAAM,KAAK,GAAG,IAAI;SAChB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;SAC1B,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,GAAG,CAAC;SACT,IAAI,EAAE,CAAA;IACR,OAAO,KAAK,IAAI,SAAS,CAAA;AAC1B,CAAC;AAED,uFAAuF;AACvF,MAAM,CAAC,KAAK,UAAU,UAAU,CAC/B,IAAsC,EACtC,OAAsB,EACtB,OAAsB,EAAE;IAExB,MAAM,OAAO,GAAmB,EAAE,CAAA;IAClC,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACxB,MAAM,EAAE,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,MAAM,CAAC,KAAK,CAAC,CAAA;QACvE,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,CAAC,CAAA;QACzC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,YAAY,CAAC,CAAA;QAChD,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,OAAO,CAAC,CAAA;QAE/C,MAAM,MAAM,GAAiB;YAC5B,EAAE;YACF,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;YACxD,YAAY,EAAE,QAAQ,CAAC,CAAC,CAAC,4BAA4B,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS;YAC3E,KAAK,EAAE,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,CAAC,IAAI,SAAS;YACjE,KAAK,EAAE,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,EAAE,WAAW,EAAE,CAAC,IAAI,SAAS;YAChF,OAAO,EACN,YAAY,IAAI,IAAI,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,cAAc,CAAC,YAAY,CAAC,CAAC,IAAI,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;YAC3G,GAAG,EAAE,GAAG;SA
CR,CAAA;QAED,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACpB,KAAK,EAAE,CAAA;IACR,CAAC;IAED,OAAO,OAAO,CAAA;AACf,CAAC;AAcD;;;;;;GAMG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAIjC;IACA,OAAO,KAAK,EAAE,GAAW,EAAiC,EAAE;QAC3D,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;QACxC,MAAM,IAAI,GAAG,eAAe,CAAC,UAAU,EAAE,EAAE,OAAO,EAAE,IAAI,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,CAAA;QAExE,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAA;QACxC,IAAI,CAAC,QAAQ,IAAI,QAAQ,CAAC,GAAG,KAAK,IAAI,IAAI,QAAQ,CAAC,GAAG,KAAK,IAAI;YAAE,OAAO,IAAI,CAAA;QAE5E,MAAM,OAAO,GAAmB;YAC/B,UAAU,EAAE,EAAE,QAAQ,EAAE,QAAQ,CAAC,GAAG,EAAE,SAAS,EAAE,QAAQ,CAAC,GAAG,EAAE;YAC/D,IAAI,EAAE,QAAQ,CAAC,eAAe;YAC9B,iBAAiB,EAAE,QAAQ,CAAC,aAAa;YACzC,SAAS,EAAE,QAAQ,CAAC,SAAS;SAC7B,CAAA;QACD,OAAO,WAAW,CAAC,IAAI,EAAE,OAAO,CAAC,CAAA;IAClC,CAAC,CAAA;AACF,CAAC"}
1
+ {"version":3,"file":"ingest.js","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAGH,OAAO,EAAE,4BAA4B,EAAE,eAAe,EAAE,eAAe,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAA;AAC/G,OAAO,EAAE,KAAK,IAAI,YAAY,EAAE,MAAM,gBAAgB,CAAA;AACtD,OAAO,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAA;AACvC,OAAO,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,aAAa,CAAA;AASzD,8EAA8E;AAC9E,MAAM,UAAU,YAAY,CAAC,IAAY;IACxC,OAAO,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,CAAA;AAC9C,CAAC;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,MAAM,CAAC,KAAK,SAAS,CAAC,CAAC,UAAU,CAChC,MAAc,EACd,OAAkC,EAAE;IAEpC,MAAM,GAAG,GAAG,CAAC,IAAI,CAAC,SAAS,IAAI,YAAY,CAAC,MAAM,CAAC,CAAC,KAAK,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAA;IAC3E,kGAAkG;IAClG,qGAAqG;IACrG,mGAAmG;IACnG,iGAAiG;IACjG,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;IACtC,IAAI,CAAC;QACJ,IAAI,MAAM,GAAoB,IAAI,CAAA;QAClC,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,eAAe,CAAC,SAAS,CAAC,MAAM,EAAE;YAC1D,SAAS,EAAE,UAAU,CAAC,QAAQ;YAC9B,WAAW,EAAE,KAAK;SAClB,CAAC,EAAE,CAAC;YACJ,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;gBAAE,SAAQ,CAAC,gCAAgC;YAChE,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA,CAAC,gBAAgB;YAClE,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;gBACrB,MAAM,GAAG,MAAM,CAAA;gBACf,SAAQ;YACT,CAAC;YACD,MAAM,GAAG,GAA2B,EAAE,CAAA;YACtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE;gBAAE,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;YACzE,MAAM,GAAG,CAAA;QACV,CAAC;IACF,CAAC;YAAS,CAAC;QACV,MAAM,MAAM,CAAC,KAAK,EAAE,CAAA;IACrB,CAAC;AACF,CAAC;AAwBD;;;;;;;;;GASG;AACH,MAAM,UAAU,YAAY,CAAC,MAAyB;IACrD,6EAA6E;IAC7E,MAAM,GAAG,GAAG,CAAC,CAAS,EAAE,EAAE,CACzB,IAAI,CAAC;SACH,WAAW,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC;SAC3B,IAAI,EAAE,GAAG,CAAA;IACZ,MAAM,OAAO,GAAkB,EAAE,CAAA;IACjC,MAAM,IAAI,GAAa,EAAE,CAAA;IACzB,MAAM,OAAO,GAAa,EAAE,CAAA;IAE5B,KAAK,MAAM,MAAM,IAAI,MAAM,EAAE,CAAC;QAC7B,MAAM,CAAC,GAAG,GAAG,CAAC,MAAM,CAAC,CAAA;QACrB,MAAM,GAAG,GAAG,CAAC,GAAG,KAAe,EAAW,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC
,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QAEpF,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC;YAAE,OAAO,CAAC,KAAK,GAAG,MAAM,CAAA;aAC/D,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,EAAE,WAAW,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,CAAC;YAAE,OAAO,CAAC,KAAK,GAAG,MAAM,CAAA;aAChG,IAAI,CAAC,OAAO,CAAC,EAAE,IAAI,GAAG,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,CAAC;YAAE,OAAO,CAAC,EAAE,GAAG,MAAM,CAAA;aACrG,IAAI,GAAG,CAAC,KAAK,EAAE,cAAc,EAAE,cAAc,EAAE,SAAS,EAAE,UAAU,EAAE,UAAU,EAAE,QAAQ,EAAE,UAAU,CAAC;YAC3G,OAAO,CAAC,YAAY,KAAK,MAAM,CAAA;aAC3B,IACJ,GAAG,CACF,QAAQ,EACR,SAAS,EACT,MAAM,EACN,MAAM,EACN,MAAM,EACN,OAAO,EACP,UAAU,EACV,KAAK,EACL,SAAS,EACT,QAAQ,EACR,UAAU,EACV,QAAQ,CACR;YAED,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;aAChB,IAAI,GAAG,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,SAAS,EAAE,UAAU,EAAE,SAAS,CAAC;YACnG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACnB,CAAC;IAED,IAAI,IAAI,CAAC,MAAM;QAAE,OAAO,CAAC,IAAI,GAAG,IAAI,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,CAAC,CAAC,IAAI,CAAA;IACnE,IAAI,OAAO,CAAC,MAAM;QAAE,OAAO,CAAC,OAAO,GAAG,OAAO,CAAA;IAC7C,OAAO,OAAO,CAAA;AACf,CAAC;AAQD,oFAAoF;AACpF,MAAM,UAAU,QAAQ,CAAC,IAAY;IACpC,OAAO,YAAY,CAAC,IAAI,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,gBAAgB,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,kBAAkB,EAAE,IAAI,EAAE,CAAC,CAAA;AAC3G,CAAC;AAED,6FAA6F;AAC7F,SAAS,IAAI,CAAC,GAA2B,EAAE,OAA2B;IACrE,IAAI,CAAC,OAAO;QAAE,OAAO,SAAS,CAAA;IAC9B,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAA;IACzD,MAAM,KAAK,GAAG,IAAI;SAChB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;SAC1B,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,GAAG,CAAC;SACT,IAAI,EAAE,CAAA;IACR,OAAO,KAAK,IAAI,SAAS,CAAA;AAC1B,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAC/B,IAA8E,EAC9E,OAAsB,EACtB,OAAsB,EAAE;IAExB,MAAM,OAAO,GAAmB,EAAE,CAAA;IAClC,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,IAAI,KAAK,EAAE,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QAC9B,MAAM,EAAE,GAAG,CAAC,
OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,MAAM,CAAC,KAAK,CAAC,CAAA;QACvE,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,CAAC,CAAA;QACzC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,YAAY,CAAC,CAAA;QAChD,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,OAAO,CAAC,CAAA;QAE/C,IAAI,UAA8C,CAAA;QAClD,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;YACxB,KAAK,MAAM,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,CAAC;gBACjE,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,CAAA;gBAChC,IAAI,KAAK;oBAAE,CAAC,UAAU,KAAK,EAAE,CAAC,CAAC,GAAG,CAAC,GAAG,KAAK,CAAA;YAC5C,CAAC;QACF,CAAC;QAED,MAAM,MAAM,GAAiB;YAC5B,EAAE;YACF,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;YACxD,YAAY,EAAE,QAAQ,CAAC,CAAC,CAAC,4BAA4B,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS;YAC3E,KAAK,EAAE,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,CAAC,IAAI,SAAS;YACjE,KAAK,EAAE,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,EAAE,WAAW,EAAE,CAAC,IAAI,SAAS;YAChF,OAAO,EACN,YAAY,IAAI,IAAI,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,cAAc,CAAC,YAAY,CAAC,CAAC,IAAI,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;YAC3G,UAAU;YACV,GAAG,EAAE,GAAG;SACR,CAAA;QAED,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACpB,KAAK,EAAE,CAAA;IACR,CAAC;IAED,OAAO,OAAO,CAAA;AACf,CAAC;AAcD;;;;;;GAMG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAIjC;IACA,OAAO,KAAK,EAAE,GAAW,EAAiC,EAAE;QAC3D,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;QACxC,MAAM,IAAI,GAAG,eAAe,CAAC,UAAU,EAAE,EAAE,OAAO,EAAE,IAAI,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,CAAA;QAExE,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAA;QACxC,IAAI,CAAC,QAAQ,IAAI,QAAQ,CAAC,GAAG,KAAK,IAAI,IAAI,QAAQ,CAAC,GAAG,KAAK,IAAI;YAAE,OAAO,IAAI,CAAA;QAE5E,MAAM,OAAO,GAAmB;YAC/B,UAAU,EAAE,EAAE,QAAQ,EAAE,QAAQ,CAAC,GAAG,EAAE,SAAS,EAAE,QAAQ,CAAC,GAAG,EAAE;YAC/D,IAAI,EAAE,QAAQ,CAAC,eAAe;YAC9B,iBAAiB,EAAE,QAAQ,CAAC,aAAa;YACzC,SAAS,EAAE,QAAQ,CAAC,SAAS;SAC7B,CAAA;QACD,OAAO,WAAW,CAAC,IAAI,EAAE,OAAO
,CAAC,CAAA;IAClC,CAAC,CAAA;AACF,CAAC"}
@@ -0,0 +1,57 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * The learned scorer (#603) — the production wiring for the gradient-boosted-tree model behind
7
+ * {@link ResolveConfig.scorer}. Two pieces:
8
+ *
9
+ * 1. {@link createMatchFeaturizer} — the ONE feature extractor for a candidate pair, used identically
10
+ * at train time (`scripts/record-matcher/train-gbt.ts`), eval time (the learned-scorer
11
+ * evals), and inference time (here). A pair → one-hot of each comparison's agreement level +
12
+ * the over-merge interaction terms (co-located × name/org disagreement) + address
13
+ * crowdedness.
14
+ * 2. {@link createGbtScorer} — wraps a trained {@link GBT} + the featurizer into the `(a, b) => number`
15
+ * the resolve pipeline's `scorer` hook expects (a logit, threshold-comparable with the
16
+ * Fellegi-Sunter weight it replaces).
17
+ *
18
+ * Both take the comparison set as INPUT (rather than importing {@link buildDefaultModel}) so this
19
+ * module has no dependency cycle with `resolve.ts`. The contract that keeps train ≡ inference:
20
+ * feed the comparisons from `buildDefaultModel({ collapseSpatial: true, addressFrequency })` —
21
+ * the model's structure (and thus the feature layout) is fixed by that config; only the frequency
22
+ * VALUES differ between the training corpus and the matched set, which is the point (the model
23
+ * generalizes, as the cross-state eval showed).
24
+ */
25
+ import { type Comparison, type GBT, type TermFrequencyTable } from "@mailwoman/match";
26
+ import type { SourceRecord } from "./types.js";
27
+ /** Inputs shared by {@link createMatchFeaturizer} and {@link createGbtScorer} (the latter additionally takes a `model`). */
28
+ export interface LearnedFeatureConfig {
29
+ /**
30
+ * The comparison set the features are built over — MUST be `buildDefaultModel({ collapseSpatial:
31
+ * true, addressFrequency }).comparisons` so the feature layout matches the trained model.
32
+ * (`usePhone` / `discriminators` are NOT part of the learned feature model — the GBT replaces the
33
+ * FS (Fellegi-Sunter) weight wholesale and owns its own feature vector.) Array order matters: the one-hot segments are emitted in this order.
34
+ */
35
+ comparisons: Comparison<SourceRecord>[];
36
+ /** Address-frequency table for the crowdedness feature (a crowded shared address is weak
37
+ identity). The featurizer queries it through its `frequency(rawAddress)` method. */
38
+ addressFrequency: TermFrequencyTable;
39
+ }
40
+ /**
41
+ * Build the per-pair feature extractor. The vector is, in order: a one-hot of each comparison's agreement
42
+ * level, then the two over-merge interaction terms (spatial-exact × name-disagree, spatial-exact ×
43
+ * org-disagree — the "same place, different names" signature that drives co-located over-merges), then address crowdedness scaled into [0, 1]. Deterministic and EM-independent, so it is identical across train / eval / inference.
44
+ * @param config The comparison set and the address-frequency table the features are built over.
45
+ * @returns A function mapping a record pair `(a, b)` to its numeric feature vector.
46
+ */
47
+ export declare function createMatchFeaturizer(config: LearnedFeatureConfig): (a: SourceRecord, b: SourceRecord) => number[];
48
+ /**
49
+ * Wrap a trained {@link GBT} into the `(a, b) => number` link scorer for {@link ResolveConfig.scorer}.
50
+ * The returned weight is the model's logit — same threshold-comparable units as the Fellegi-Sunter weight it replaces, so the pipeline's clustering + threshold semantics are unchanged.
51
+ * @param config The featurizer inputs ({@link LearnedFeatureConfig}) plus `model`, the trained {@link GBT} to score with.
52
+ * @returns A function that scores a record pair `(a, b)` as a single number.
53
+ */
54
+ export declare function createGbtScorer(config: LearnedFeatureConfig & {
55
+ model: GBT;
56
+ }): (a: SourceRecord, b: SourceRecord) => number;
57
+ //# sourceMappingURL=learned-scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"learned-scorer.d.ts","sourceRoot":"","sources":["../learned-scorer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,EAAoB,KAAK,UAAU,EAAE,KAAK,GAAG,EAAY,KAAK,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AACjH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C,4DAA4D;AAC5D,MAAM,WAAW,oBAAoB;IACpC;;;;;OAKG;IACH,WAAW,EAAE,UAAU,CAAC,YAAY,CAAC,EAAE,CAAA;IACvC;aACY;IACZ,gBAAgB,EAAE,kBAAkB,CAAA;CACpC;AAED;;;;;;GAMG;AACH,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,oBAAoB,GAAG,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,KAAK,MAAM,EAAE,CAkClH;AAED;;;;;GAKG;AACH,wBAAgB,eAAe,CAC9B,MAAM,EAAE,oBAAoB,GAAG;IAAE,KAAK,EAAE,GAAG,CAAA;CAAE,GAC3C,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,KAAK,MAAM,CAI9C"}
@@ -0,0 +1,78 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * The learned scorer (#603) — the production wiring for the gradient-boosted-tree model behind
7
+ * {@link ResolveConfig.scorer}. Two pieces:
8
+ *
9
+ * 1. {@link createMatchFeaturizer} — the ONE feature extractor for a candidate pair, used identically
10
+ * at train time (`scripts/record-matcher/train-gbt.ts`), eval time (the learned-scorer
11
+ * evals), and inference time (here). A pair → one-hot of each comparison's agreement level +
12
+ * the over-merge interaction terms (co-located × name/org disagreement) + address
13
+ * crowdedness.
14
+ * 2. {@link createGbtScorer} — wraps a trained {@link GBT} + the featurizer into the `(a, b) => number`
15
+ * the resolve pipeline's `scorer` hook expects (a logit, threshold-comparable with the
16
+ * Fellegi-Sunter weight it replaces).
17
+ *
18
+ * Both take the comparison set as INPUT (rather than importing {@link buildDefaultModel}) so this
19
+ * module has no dependency cycle with `resolve.ts`. The contract that keeps train ≡ inference:
20
+ * feed the comparisons from `buildDefaultModel({ collapseSpatial: true, addressFrequency })` —
21
+ * the model's structure (and thus the feature layout) is fixed by that config; only the frequency
22
+ * VALUES differ between the training corpus and the matched set, which is the point (the model
23
+ * generalizes, as the cross-state eval showed).
24
+ */
25
+ import { agreementPattern, gbtScore } from "@mailwoman/match";
26
+ /**
27
+ * Build the per-pair feature extractor. The vector is, in order: a one-hot of each comparison's agreement
28
+ * level, then the two over-merge interaction terms (spatial-exact × name-disagree, spatial-exact ×
29
+ * org-disagree — the "same place, different names" signature that drives co-located over-merges), then address crowdedness scaled into [0, 1]. Deterministic and EM-independent, so it is identical across train / eval / inference.
30
+ * @param config `comparisons` to featurize over plus the `addressFrequency` table. An interaction term is always 0 when a comparison it needs (`spatial`, `given`, `family`, `organization`) is not present by name.
31
+ * @returns `(a, b) => number[]`. Note the crowdedness feature is looked up from `a`'s raw address only, and is 0 when `a` has none.
32
+ */
33
+ export function createMatchFeaturizer(config) {
34
+ const { comparisons, addressFrequency } = config;
35
+ const levelCounts = comparisons.map((c) => c.levels.length); // number of agreement levels per comparison = width of its one-hot segment
36
+ const index = Object.fromEntries(comparisons.map((c, i) => [c.name, i])); // comparison name → position in `comparisons`
37
+ const spatialI = index["spatial"]; // each lookup is undefined when no comparison has that name
38
+ const givenI = index["given"];
39
+ const familyI = index["family"];
40
+ const orgI = index["organization"];
41
+ const lastLevel = (i) => levelCounts[i] - 1; // index of comparison i's final level (used below as the catch-all "disagree" level)
42
+ return (a, b) => {
43
+ const pat = agreementPattern(comparisons, a, b); // pat[i] is read as the agreement level of comparisons[i]
44
+ const f = [];
45
+ for (let i = 0; i < pat.length; i++) {
46
+ const lvl = pat[i];
47
+ for (let l = 0; l < levelCounts[i]; l++) // one-hot: 1 in the slot of the observed level, 0 elsewhere
48
+ f.push(lvl === l ? 1 : 0);
49
+ }
50
+ // Interaction: co-located (spatial exact = level 0) AND names/org disagree (catch-all level). Each flag is 0 when its comparison is absent.
51
+ const spatialExact = spatialI !== undefined && pat[spatialI] === 0 ? 1 : 0;
52
+ const nameDisagree = givenI !== undefined &&
53
+ familyI !== undefined &&
54
+ pat[givenI] === lastLevel(givenI) &&
55
+ pat[familyI] === lastLevel(familyI)
56
+ ? 1
57
+ : 0;
58
+ const orgDisagree = orgI !== undefined && pat[orgI] === lastLevel(orgI) ? 1 : 0;
59
+ f.push(spatialExact * nameDisagree); // the over-merge signature: same place, names disagree (BOTH given and family at their last level)
60
+ f.push(spatialExact * orgDisagree); // same place, organization at its last level
61
+ // Address crowdedness (how shared this address is) — high → "same address" is weak evidence. Only `a`'s raw address is consulted; missing/empty → 0.
62
+ const freq = a.address?.raw ? addressFrequency.frequency(a.address.raw) : 0;
63
+ f.push(Math.min(1, freq * 1000)); // scale into a usable range: multiplied by 1000, capped at 1
64
+ return f;
65
+ };
66
+ }
67
+ /**
68
+ * Wrap a trained {@link GBT} into the `(a, b) => number` link scorer for {@link ResolveConfig.scorer}.
69
+ * The returned weight is the model's logit — same threshold-comparable units as the Fellegi-Sunter weight it replaces, so the pipeline's clustering + threshold semantics are unchanged.
70
+ * @param config The featurizer inputs (`comparisons`, `addressFrequency`) plus `model`, the trained {@link GBT}.
71
+ * @returns A function that featurizes a record pair `(a, b)` and returns `gbtScore(model, features)`.
72
+ */
73
+ export function createGbtScorer(config) {
74
+ const featurize = createMatchFeaturizer(config); // built once here; the returned scorer reuses it for every pair
75
+ const { model } = config;
76
+ return (a, b) => gbtScore(model, featurize(a, b)); // featurize the pair, then score the vector with the model
77
+ }
78
+ //# sourceMappingURL=learned-scorer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"learned-scorer.js","sourceRoot":"","sources":["../learned-scorer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,EAAE,gBAAgB,EAA6B,QAAQ,EAA2B,MAAM,kBAAkB,CAAA;AAiBjH;;;;;;GAMG;AACH,MAAM,UAAU,qBAAqB,CAAC,MAA4B;IACjE,MAAM,EAAE,WAAW,EAAE,gBAAgB,EAAE,GAAG,MAAM,CAAA;IAChD,MAAM,WAAW,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAA;IAC3D,MAAM,KAAK,GAAG,MAAM,CAAC,WAAW,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAuC,CAAA;IAC9G,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,CAAC,CAAA;IACjC,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,CAAC,CAAA;IAC7B,MAAM,OAAO,GAAG,KAAK,CAAC,QAAQ,CAAC,CAAA;IAC/B,MAAM,IAAI,GAAG,KAAK,CAAC,cAAc,CAAC,CAAA;IAClC,MAAM,SAAS,GAAG,CAAC,CAAS,EAAU,EAAE,CAAC,WAAW,CAAC,CAAC,CAAE,GAAG,CAAC,CAAA;IAE5D,OAAO,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACf,MAAM,GAAG,GAAG,gBAAgB,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC,CAAA;QAC/C,MAAM,CAAC,GAAa,EAAE,CAAA;QACtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,MAAM,GAAG,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;YACnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,CAAC,CAAE,EAAE,CAAC,EAAE;gBAAE,CAAC,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QACpE,CAAC;QACD,8FAA8F;QAC9F,MAAM,YAAY,GAAG,QAAQ,KAAK,SAAS,IAAI,GAAG,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QAC1E,MAAM,YAAY,GACjB,MAAM,KAAK,SAAS;YACpB,OAAO,KAAK,SAAS;YACrB,GAAG,CAAC,MAAM,CAAC,KAAK,SAAS,CAAC,MAAM,CAAC;YACjC,GAAG,CAAC,OAAO,CAAC,KAAK,SAAS,CAAC,OAAO,CAAC;YAClC,CAAC,CAAC,CAAC;YACH,CAAC,CAAC,CAAC,CAAA;QACL,MAAM,WAAW,GAAG,IAAI,KAAK,SAAS,IAAI,GAAG,CAAC,IAAI,CAAC,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QAC/E,CAAC,CAAC,IAAI,CAAC,YAAY,GAAG,YAAY,CAAC,CAAA,CAAC,uDAAuD;QAC3F,CAAC,CAAC,IAAI,CAAC,YAAY,GAAG,WAAW,CAAC,CAAA;QAClC,6FAA6F;QAC7F,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,C
AAA;QAC3E,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC,CAAC,CAAA,CAAC,4BAA4B;QAC7D,OAAO,CAAC,CAAA;IACT,CAAC,CAAA;AACF,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,eAAe,CAC9B,MAA6C;IAE7C,MAAM,SAAS,GAAG,qBAAqB,CAAC,MAAM,CAAC,CAAA;IAC/C,MAAM,EAAE,KAAK,EAAE,GAAG,MAAM,CAAA;IACxB,OAAO,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;AAClD,CAAC"}
@@ -0,0 +1,36 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * GENERATED by scripts/record-matcher/train-gbt.ts — DO NOT edit by hand; retrain to update.
7
+ *
8
+ * The default learned-scorer model (#603): a gradient-boosted-tree dedup scorer trained on the
9
+ * NPPES NPI-truth set (TX, 3000 NPIs → 707975 candidate pairs). Validated to generalize across
10
+ * states by learned-scorer-crossstate-eval.ts. Used by resolveEntities' opt-in learnedScorer hook
11
+ * via createGbtScorer. The trained {@link GBT} is plain data.
12
+ */
13
+ import type { GBT } from "@mailwoman/match";
14
+ /** Provenance for the bundled model — what it was trained on. */
15
+ export declare const DEDUP_GBT_META: {
16
+ readonly version: "1.0.0";
17
+ readonly locale: "en-US";
18
+ readonly trainedOn: "2026-06-15";
19
+ readonly state: "TX";
20
+ readonly npis: 3000;
21
+ readonly records: 8602;
22
+ readonly pairs: 707975;
23
+ readonly posRate: 0.0088;
24
+ readonly hyperparams: {
25
+ readonly rounds: 120;
26
+ readonly depth: 3;
27
+ readonly lr: 0.3;
28
+ readonly minLeaf: 20;
29
+ };
30
+ readonly recommendedThreshold: 2.7143;
31
+ readonly features: 17;
32
+ readonly addressFrequencyDistinct: 3317267;
33
+ readonly addressFrequencyTotal: 9260504;
34
+ };
35
+ export declare const DEDUP_GBT_MODEL: GBT;
36
+ //# sourceMappingURL=dedup-gbt-en-us.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dedup-gbt-en-us.d.ts","sourceRoot":"","sources":["../../models/dedup-gbt-en-us.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,KAAK,EAAE,GAAG,EAAE,MAAM,kBAAkB,CAAA;AAE3C,iEAAiE;AACjE,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;CAmBjB,CAAA;AAGV,eAAO,MAAM,eAAe,EAAE,GAAo0vD,CAAA"}