@mailwoman/registry 4.8.1 → 4.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,92 @@
1
+ # @mailwoman/registry
2
+
3
+ **Geocode-first record-matching application** — the high-level entry point that
4
+ runs the full block → score → cluster pipeline over ingested records and returns
5
+ canonical entities ready for export.
6
+
7
+ This is the clinic-funding use case Mailwoman was built for, standing on a
8
+ calibrated, label-free matcher.
9
+
10
+ ```ts
11
+ import { resolveEntities, ingestRows, toGeoJSON } from "@mailwoman/registry";
12
+
13
+ // 1. Ingest — CSV/array → normalized SourceRecords
14
+ const records = await ingestRows(rows, {
15
+ name: "Provider Name", address: ["Street Address", "City"], // …plus any other ColumnMapping fields
16
+ });
17
+
18
+ // 2. Resolve — block → score → cluster with geo-first defaults
19
+ const entities = resolveEntities(records, {
20
+ geocodeAddress: async (row) => ({ lat: 30.2672, lon: -97.7431 }),
21
+ });
22
+
23
+ // 3. Export — GeoJSON for QGIS
24
+ const fc = toGeoJSON(entities);
25
+ // → FeatureCollection with Point features + entity properties
26
+ ```
27
+
28
+ ## The full pipeline
29
+
30
+ ```
31
+ CSV / SQLite → ingestRows → SourceRecord[] → resolveEntities → ResolvedEntity[]
32
+
33
+ toGeoJSON()
34
+
35
+ GeoJSON → QGIS
36
+ ```
37
+
38
+ ## API
39
+
40
+ ```ts
41
+ // Ingest — parse CSV / map columns → normalized records
42
+ import { ingestRows, parseCsv, inferMapping } from "@mailwoman/registry"
43
+ // {
44
+ // ingestRows(rows, mapping, opts?): Promise<SourceRecord[]>
45
+ // parseCsv(csvText): Record<string, string>[]
46
+ // inferMapping(headers): ColumnMapping
47
+ // }
48
+
49
+ // Resolve — run the full matcher pipeline
50
+ import { resolveEntities } from "@mailwoman/registry"
51
+ // resolveEntities(records, config): ResolvedEntity[]
52
+ // Config: { geocodeAddress?, scorer?, blockingKeys?, threshold?, discriminators? }
53
+
54
+ // Export — GeoJSON, MapLibre HTML, reconciliation reports
55
+ import { toGeoJSON, toMapHTML, reconcile } from "@mailwoman/registry"
56
+
57
+ // Learned scorer — pre-trained GBT for single-dataset dedup
58
+ import { dedupGbtEnUs } from "@mailwoman/registry"
59
+ ```
60
+
61
+ ## Default configuration
62
+
63
+ `resolveEntities` ships with sensible defaults:
64
+
65
+ - **Blocking keys:** geo-cell (H3) + canonical address + phone + email
66
+ - **Scoring model:** Fellegi-Sunter with label-free EM, term frequency adjustment
67
+ - **Learned scorer:** optional GBT for single-dataset dedup (opt-in via `scorer`)
68
+ - **Threshold:** 0.5 (configurable precision/recall knob)
69
+
70
+ ## CLI
71
+
72
+ The `mailwoman` CLI exposes `registry` as a command:
73
+
74
+ ```bash
75
+ # Multi-source entity resolution
76
+ mailwoman registry --sources config.json --out entities.geojson
77
+
78
+ # Cross-dataset reconciliation
79
+ mailwoman registry --sources tx-nppes.json --reconcile tx-fcc.json
80
+ ```
81
+
82
+ ## Related
83
+
84
+ - [`@mailwoman/match`](../match) — the low-level block/score/cluster primitives
85
+ - [`@mailwoman/record`](../record) — `SourceRecord` schema and normalizers
86
+ - [`@mailwoman/address-id`](../address-id) — exact-match join key
87
+ - [Geocode-First Record Matching](https://mailwoman.sister.software/articles/concepts/geocode-first-record-matching/)
88
+ - [Dedup Entity Truth](https://mailwoman.sister.software/articles/concepts/dedup-entity-truth/)
89
+
90
+ ## License
91
+
92
+ [AGPL-3.0-only](https://www.gnu.org/licenses/agpl-3.0.html)
@@ -0,0 +1,31 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * The address-id consumer for the matcher (#259). Derives a stable {@link PostalAddressID} from a
7
+ * resolved {@link SourceRecord} and exposes it as a blocking key — the deterministic,
8
+ * exact-canonical-address complement to the fuzzy Fellegi-Sunter / GBT scoring. Two uses:
9
+ *
10
+ * - **As a pre-dedup / join key:** `GROUP BY postalAddressId(record)` collapses records that resolve
11
+ * to the same place AND share a canonical address with NO scoring at all — the cheap, certain
12
+ * slice of dedup before the matcher does the fuzzy rest.
13
+ * - **As a blocking key:** {@link addressIdBlockingKey} adds the address-id to the blocking union, so
14
+ * records sharing one are guaranteed to be compared.
15
+ */
16
+ import { type PostalAddressID } from "@mailwoman/address-id";
17
+ import { type BlockingKey } from "@mailwoman/match";
18
+ import type { SourceRecord } from "./types.js";
19
+ /**
20
+ * The stable address primary key for a record, or null when it isn't geocoded (no coordinate → no
21
+ * locality cell) or carries no raw address to hash. Uses the resolved coordinate + the raw address;
22
+ * the state prefix is plucked from the address when present.
23
+ */
24
+ export declare function postalAddressId(record: SourceRecord): PostalAddressID | null;
25
+ /**
26
+ * A blocking key on the {@link postalAddressId} — records that resolve to the same place with the
27
+ * same canonical address block together. Add it to {@link defaultBlockingKeys}'s union when an exact
28
+ * address join should never be missed.
29
+ */
30
+ export declare function addressIdBlockingKey(): BlockingKey<SourceRecord>;
31
+ //# sourceMappingURL=address-key.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"address-key.d.ts","sourceRoot":"","sources":["../address-key.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,EAAyB,KAAK,eAAe,EAAE,MAAM,uBAAuB,CAAA;AACnF,OAAO,EAAE,KAAK,WAAW,EAAY,MAAM,kBAAkB,CAAA;AAC7D,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,MAAM,EAAE,YAAY,GAAG,eAAe,GAAG,IAAI,CAK5E;AAED;;;;GAIG;AACH,wBAAgB,oBAAoB,IAAI,WAAW,CAAC,YAAY,CAAC,CAEhE"}
@@ -0,0 +1,38 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * The address-id consumer for the matcher (#259). Derives a stable {@link PostalAddressID} from a
7
+ * resolved {@link SourceRecord} and exposes it as a blocking key — the deterministic,
8
+ * exact-canonical-address complement to the fuzzy Fellegi-Sunter / GBT scoring. Two uses:
9
+ *
10
+ * - **As a pre-dedup / join key:** `GROUP BY postalAddressId(record)` collapses records that resolve
11
+ * to the same place AND share a canonical address with NO scoring at all — the cheap, certain
12
+ * slice of dedup before the matcher does the fuzzy rest.
13
+ * - **As a blocking key:** {@link addressIdBlockingKey} adds the address-id to the blocking union, so
14
+ * records sharing one are guaranteed to be compared.
15
+ */
16
+ import { createPostalAddressID } from "@mailwoman/address-id";
17
+ import { exactKey } from "@mailwoman/match";
18
+ /**
19
+ * The stable address primary key for a record, or null when it isn't geocoded (no coordinate → no
20
+ * locality cell) or carries no raw address to hash. Uses the resolved coordinate + the raw address;
21
+ * the state prefix is plucked from the address when present.
22
+ */
23
+ export function postalAddressId(record) {
24
+ const coordinate = record.address?.geocode?.coordinate;
25
+ const address = record.address?.raw;
26
+ if (!coordinate || !address)
27
+ return null;
28
+ return createPostalAddressID({ coordinate, address });
29
+ }
30
+ /**
31
+ * A blocking key on the {@link postalAddressId} — records that resolve to the same place with the
32
+ * same canonical address block together. Add it to {@link defaultBlockingKeys}'s union when an exact
33
+ * address join should never be missed.
34
+ */
35
+ export function addressIdBlockingKey() {
36
+ return exactKey((record) => postalAddressId(record));
37
+ }
38
+ //# sourceMappingURL=address-key.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"address-key.js","sourceRoot":"","sources":["../address-key.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,EAAE,qBAAqB,EAAwB,MAAM,uBAAuB,CAAA;AACnF,OAAO,EAAoB,QAAQ,EAAE,MAAM,kBAAkB,CAAA;AAG7D;;;;GAIG;AACH,MAAM,UAAU,eAAe,CAAC,MAAoB;IACnD,MAAM,UAAU,GAAG,MAAM,CAAC,OAAO,EAAE,OAAO,EAAE,UAAU,CAAA;IACtD,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,EAAE,GAAG,CAAA;IACnC,IAAI,CAAC,UAAU,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IACxC,OAAO,qBAAqB,CAAC,EAAE,UAAU,EAAE,OAAO,EAAE,CAAC,CAAA;AACtD,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,oBAAoB;IACnC,OAAO,QAAQ,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,CAAA;AACrD,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"geojson.d.ts","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,KAAK,EAAkB,wBAAwB,EAAE,cAAc,EAAgB,MAAM,YAAY,CAAA;AAmCxG;;;GAGG;AACH,wBAAgB,SAAS,CAAC,QAAQ,EAAE,SAAS,cAAc,EAAE,GAAG,wBAAwB,CAKvF"}
1
+ {"version":3,"file":"geojson.d.ts","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,KAAK,EAAkB,wBAAwB,EAAE,cAAc,EAAgB,MAAM,YAAY,CAAA;AAqCxG;;;GAGG;AACH,wBAAgB,SAAS,CAAC,QAAQ,EAAE,SAAS,cAAc,EAAE,GAAG,wBAAwB,CAKvF"}
package/out/geojson.js CHANGED
@@ -34,6 +34,8 @@ function toFeature(entity) {
34
34
  recordCount: entity.records.length,
35
35
  cohesion: entity.cohesion,
36
36
  sourceIds: entity.records.map((r) => r.id),
37
+ // Distinct provenance labels the entity's records span — an entity with ≥2 is a cross-dataset link.
38
+ sources: [...new Set(entity.records.map((r) => r.source).filter((s) => !!s))].sort(),
37
39
  name: displayName(rep),
38
40
  organization: rep.organization?.canonical ?? null,
39
41
  address: rep.address?.formatted ?? null,
@@ -1 +1 @@
1
- {"version":3,"file":"geojson.js","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAIH,0EAA0E;AAC1E,SAAS,WAAW,CAAC,MAAoB;IACxC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAA;IACxB,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAA;IACtB,MAAM,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,cAAc,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC;SAClG,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,GAAG,CAAC;SACT,IAAI,EAAE,CAAA;IACR,OAAO,MAAM,IAAI,IAAI,CAAA;AACtB,CAAC;AAED,8CAA8C;AAC9C,SAAS,SAAS,CAAC,MAAsB;IACxC,MAAM,GAAG,GAAG,MAAM,CAAC,cAAc,CAAA;IACjC,OAAO;QACN,IAAI,EAAE,SAAS;QACf,QAAQ,EAAE;YACT,IAAI,EAAE,OAAO;YACb,WAAW,EAAE,CAAC,MAAM,CAAC,UAAW,CAAC,SAAS,EAAE,MAAM,CAAC,UAAW,CAAC,QAAQ,CAAC;SACxE;QACD,UAAU,EAAE;YACX,QAAQ,EAAE,MAAM,CAAC,EAAE;YACnB,WAAW,EAAE,MAAM,CAAC,OAAO,CAAC,MAAM;YAClC,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,SAAS,EAAE,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC1C,IAAI,EAAE,WAAW,CAAC,GAAG,CAAC;YACtB,YAAY,EAAE,GAAG,CAAC,YAAY,EAAE,SAAS,IAAI,IAAI;YACjD,OAAO,EAAE,GAAG,CAAC,OAAO,EAAE,SAAS,IAAI,IAAI;YACvC,WAAW,EAAE,GAAG,CAAC,OAAO,EAAE,OAAO,EAAE,IAAI,IAAI,IAAI;SAC/C;KACD,CAAA;AACF,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,SAAS,CAAC,QAAmC;IAC5D,OAAO;QACN,IAAI,EAAE,mBAAmB;QACzB,QAAQ,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC;KACvE,CAAA;AACF,CAAC"}
1
+ {"version":3,"file":"geojson.js","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAIH,0EAA0E;AAC1E,SAAS,WAAW,CAAC,MAAoB;IACxC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAA;IACxB,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAA;IACtB,MAAM,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,cAAc,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC;SAClG,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,GAAG,CAAC;SACT,IAAI,EAAE,CAAA;IACR,OAAO,MAAM,IAAI,IAAI,CAAA;AACtB,CAAC;AAED,8CAA8C;AAC9C,SAAS,SAAS,CAAC,MAAsB;IACxC,MAAM,GAAG,GAAG,MAAM,CAAC,cAAc,CAAA;IACjC,OAAO;QACN,IAAI,EAAE,SAAS;QACf,QAAQ,EAAE;YACT,IAAI,EAAE,OAAO;YACb,WAAW,EAAE,CAAC,MAAM,CAAC,UAAW,CAAC,SAAS,EAAE,MAAM,CAAC,UAAW,CAAC,QAAQ,CAAC;SACxE;QACD,UAAU,EAAE;YACX,QAAQ,EAAE,MAAM,CAAC,EAAE;YACnB,WAAW,EAAE,MAAM,CAAC,OAAO,CAAC,MAAM;YAClC,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,SAAS,EAAE,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC1C,oGAAoG;YACpG,OAAO,EAAE,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE;YACjG,IAAI,EAAE,WAAW,CAAC,GAAG,CAAC;YACtB,YAAY,EAAE,GAAG,CAAC,YAAY,EAAE,SAAS,IAAI,IAAI;YACjD,OAAO,EAAE,GAAG,CAAC,OAAO,EAAE,SAAS,IAAI,IAAI;YACvC,WAAW,EAAE,GAAG,CAAC,OAAO,EAAE,OAAO,EAAE,IAAI,IAAI,IAAI;SAC/C;KACD,CAAA;AACF,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,SAAS,CAAC,QAAmC;IAC5D,OAAO;QACN,IAAI,EAAE,mBAAmB;QACzB,QAAQ,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC;KACvE,CAAA;AACF,CAAC"}
package/out/index.d.ts CHANGED
@@ -10,8 +10,13 @@
10
10
  * {@link toGeoJSON} exports them for QGIS. This is the clinic-funding use case mailwoman was built
11
11
  * for, finally standing on a calibrated, label-free matcher.
12
12
  */
13
+ export * from "./address-key.js";
13
14
  export * from "./geojson.js";
14
15
  export * from "./ingest.js";
16
+ export * from "./learned-scorer.js";
17
+ export * from "./map-html.js";
18
+ export * from "./models/dedup-gbt-en-us.js";
19
+ export * from "./reconcile.js";
15
20
  export * from "./resolve.js";
16
21
  export * from "./types.js";
17
22
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,kBAAkB,CAAA;AAChC,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,qBAAqB,CAAA;AACnC,cAAc,eAAe,CAAA;AAC7B,cAAc,6BAA6B,CAAA;AAC3C,cAAc,gBAAgB,CAAA;AAC9B,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
package/out/index.js CHANGED
@@ -10,8 +10,13 @@
10
10
  * {@link toGeoJSON} exports them for QGIS. This is the clinic-funding use case mailwoman was built
11
11
  * for, finally standing on a calibrated, label-free matcher.
12
12
  */
13
+ export * from "./address-key.js";
13
14
  export * from "./geojson.js";
14
15
  export * from "./ingest.js";
16
+ export * from "./learned-scorer.js";
17
+ export * from "./map-html.js";
18
+ export * from "./models/dedup-gbt-en-us.js";
19
+ export * from "./reconcile.js";
15
20
  export * from "./resolve.js";
16
21
  export * from "./types.js";
17
22
  //# sourceMappingURL=index.js.map
package/out/index.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,kBAAkB,CAAA;AAChC,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,qBAAqB,CAAA;AACnC,cAAc,eAAe,CAAA;AAC7B,cAAc,6BAA6B,CAAA;AAC3C,cAAc,gBAAgB,CAAA;AAC9B,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
package/out/ingest.d.ts CHANGED
@@ -26,8 +26,36 @@ import { toPostalAddress } from "@mailwoman/record";
26
26
  import type { SourceRecord } from "./types.js";
27
27
  /** Resolve a raw address string into a {@link PostalAddress}. The seam to mailwoman's geocoder. */
28
28
  export type GeocodeAddress = (raw: string) => Promise<PostalAddress | null> | PostalAddress | null;
29
- /** Maps dataset columns to record fields. A field may draw from several columns (joined with
30
- spaces). */
29
+ /** Column delimiter of a delimited source. */
30
+ export type Delimiter = "comma" | "tab";
31
+ /** Infer the delimiter from a path's extension (`.tsv` → tab, else comma). */
32
+ export declare function delimiterFor(path: string): Delimiter;
33
+ /**
34
+ * Stream a delimited file's rows lazily as header-keyed objects — the same shape {@link parseCsv}
35
+ * returns, but **without loading the file into memory**. A multi-GB source (the NPPES registry is
36
+ * ~4.8 GB / 9.6M rows — too big for `readFileSync`, which throws `ERR_STRING_TOO_LONG`) streams
37
+ * line by line. Keys are the original header names so a {@link ColumnMapping} written against the
38
+ * source's headers matches. Filter/sample the stream before {@link ingestRows} to keep only the rows
39
+ * you geocode.
40
+ *
41
+ * We stream _lines_ with spliterator's `TextSpliterator` (pure-Node, the part that handles the huge
42
+ * file) and split each line into columns here with `String.prototype.split`. We deliberately do NOT
43
+ * use `CSVSpliterator`: its column tokenizer hard-codes `skipEmpty` (it builds the column
44
+ * spliterator as `{ delimiter }` with no `skipEmpty: false`), so consecutive delimiters collapse
45
+ * and EMPTY FIELDS ARE DROPPED — fatal for a fixed-column-count registry like NPPES where a row of 330
46
+ * columns full of empties would mis-parse to 40 and shift every value. (Upstream `spliterator` bug;
47
+ * revisit when it's fixed.)
48
+ *
49
+ * Assumes an unquoted delimited file (no fields containing the delimiter) — true for these
50
+ * government TSVs. For small, possibly-quoted CSVs use {@link parseCsv} (quote-aware, in-memory).
51
+ */
52
+ export declare function streamRows(source: string, opts?: {
53
+ delimiter?: Delimiter;
54
+ }): AsyncGenerator<Record<string, string>>;
55
+ /**
56
+ * Maps dataset columns to record fields. A field may draw from several columns (joined with
57
+ * spaces).
58
+ */
31
59
  export interface ColumnMapping {
32
60
  /** Column holding a stable row id. Falls back to the row index. */
33
61
  id?: string;
@@ -38,16 +66,49 @@ export interface ColumnMapping {
38
66
  address?: string | string[];
39
67
  phone?: string;
40
68
  email?: string;
69
+ /**
70
+ * Extra secondary-identifier fields → the column(s) to draw each from (joined with spaces). Land
71
+ * on `SourceRecord.attributes` under the same key, for the matcher's `discriminators`
72
+ * (authorized-official name, taxonomy, license…).
73
+ */
74
+ attributes?: Record<string, string | string[]>;
41
75
  }
76
+ /**
77
+ * Best-effort {@link ColumnMapping} inferred from a header row — the "point it at any CSV"
78
+ * convenience. Each column name is matched (case- and punctuation-insensitive, on whole tokens) to
79
+ * a field by keyword, in a precedence that resolves the common ambiguities: a dedicated id / phone
80
+ * / email column is claimed before the generic sweep, an org / facility column beats a person
81
+ * "name", and address columns (street / city / state / zip…) collect into one multi-column field.
82
+ * Imperfect on bespoke headers (an explicit mapping or the LLM-assisted inference #603 is the
83
+ * answer there), but it nails tidy and semi-tidy files with no hand-mapping. Unmatched columns are
84
+ * left out.
85
+ */
86
+ export declare function inferMapping(header: readonly string[]): ColumnMapping;
42
87
  /** Options for {@link ingestRows}. */
43
88
  export interface IngestOptions {
44
89
  /** The geocoding seam. Without it, records carry name/org but no resolved address. */
45
90
  geocodeAddress?: GeocodeAddress;
91
+ /**
92
+ * Separator for joining a multi-column ADDRESS mapping (name/org always join with a space).
93
+ * Default `", "`, which gives the parser delimited input (`"214 Main St, Austin, TX
94
+ * 78701"`) instead of a concatenated run (`"214 Main St Austin TX 78701"`) — the latter strips
95
+ * the parser's segmentation boundaries and is partly OOD (it also breaks all-caps
96
+ * case-normalization; #694). **Default `", "` (#694 flip, validated).** Comma-join is the correct
97
+ * shape for an address built from separate columns, and #700 measured it at +15% cross-dataset
98
+ * rooftop (579→667) with no comma-less crater. The dedup GBT was trained on the old space-joined
99
+ * coords, so this flip is paired with a GBT re-validation (#694). Pass `" "` to restore the
100
+ * legacy space-join for a byte-stable A/B.
101
+ */
102
+ addressSeparator?: string;
46
103
  }
47
104
  /** Parse a CSV string (with a header row) into row objects keyed by column name. */
48
105
  export declare function parseCsv(text: string): Record<string, string>[];
49
- /** Normalize tabular rows into {@link SourceRecord}s under a {@link ColumnMapping}. */
50
- export declare function ingestRows(rows: Iterable<Record<string, string>>, mapping: ColumnMapping, opts?: IngestOptions): Promise<SourceRecord[]>;
106
+ /**
107
+ * Normalize tabular rows into {@link SourceRecord}s under a {@link ColumnMapping}. Accepts a sync OR
108
+ * async iterable, so {@link parseCsv} (in-memory) and {@link streamRows} (lazy, for huge files) both
109
+ * thread straight through.
110
+ */
111
+ export declare function ingestRows(rows: Iterable<Record<string, string>> | AsyncIterable<Record<string, string>>, mapping: ColumnMapping, opts?: IngestOptions): Promise<SourceRecord[]>;
51
112
  /**
52
113
  * The subset of mailwoman's `GeocodeResult` the adapter consumes — kept structural so this package
53
114
  * never imports the heavy geocoder, yet a real `GeocodeResult` maps straight in.
@@ -1 +1 @@
1
- {"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,aAAa,EAAE,MAAM,mBAAmB,CAAA;AACtE,OAAO,EAAiD,eAAe,EAAe,MAAM,mBAAmB,CAAA;AAE/G,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C,mGAAmG;AACnG,MAAM,MAAM,cAAc,GAAG,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,GAAG,aAAa,GAAG,IAAI,CAAA;AAElG;WACW;AACX,MAAM,WAAW,aAAa;IAC7B,mEAAmE;IACnE,EAAE,CAAC,EAAE,MAAM,CAAA;IACX,+DAA+D;IAC/D,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,IAAI,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IACxB,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IAChC,OAAO,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IAC3B,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;CACd;AAED,sCAAsC;AACtC,MAAM,WAAW,aAAa;IAC7B,sFAAsF;IACtF,cAAc,CAAC,EAAE,cAAc,CAAA;CAC/B;AAED,oFAAoF;AACpF,wBAAgB,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAE/D;AAcD,uFAAuF;AACvF,wBAAsB,UAAU,CAC/B,IAAI,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,EACtC,OAAO,EAAE,aAAa,EACtB,IAAI,GAAE,aAAkB,GACtB,OAAO,CAAC,YAAY,EAAE,CAAC,CA2BzB;AAED;;;GAGG;AACH,MAAM,WAAW,UAAU;IAC1B,GAAG,EAAE,MAAM,GAAG,IAAI,CAAA;IAClB,GAAG,EAAE,MAAM,GAAG,IAAI,CAAA;IAClB,eAAe,EAAE,cAAc,CAAC,MAAM,CAAC,CAAA;IACvC,aAAa,EAAE,MAAM,GAAG,IAAI,CAAA;IAC5B,SAAS,CAAC,EAAE,cAAc,CAAC,WAAW,CAAC,CAAA;CACvC;AAED;;;;;;GAMG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE;IACvC,KAAK,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,UAAU,CAAC,OAAO,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,OAAO,eAAe,CAAC,CAAC,CAAC,CAAC,CAAA;IAC9G,OAAO,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,GAAG,UAAU,GAAG,IAAI,CAAA;IACxE,OAAO,CAAC,EAAE,MAAM,CAAA;CAChB,GAAG,cAAc,CAgBjB"}
1
+ {"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,aAAa,EAAE,MAAM,mBAAmB,CAAA;AACtE,OAAO,EAAiD,eAAe,EAAe,MAAM,mBAAmB,CAAA;AAI/G,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C,mGAAmG;AACnG,MAAM,MAAM,cAAc,GAAG,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,GAAG,aAAa,GAAG,IAAI,CAAA;AAElG,8CAA8C;AAC9C,MAAM,MAAM,SAAS,GAAG,OAAO,GAAG,KAAK,CAAA;AAEvC,8EAA8E;AAC9E,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS,CAEpD;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAuB,UAAU,CAChC,MAAM,EAAE,MAAM,EACd,IAAI,GAAE;IAAE,SAAS,CAAC,EAAE,SAAS,CAAA;CAAO,GAClC,cAAc,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CA0BxC;AAED;;;GAGG;AACH,MAAM,WAAW,aAAa;IAC7B,mEAAmE;IACnE,EAAE,CAAC,EAAE,MAAM,CAAA;IACX,+DAA+D;IAC/D,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,IAAI,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IACxB,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IAChC,OAAO,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IAC3B,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd;;;;OAIG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC,CAAA;CAC9C;AAED;;;;;;;;;GASG;AACH,wBAAgB,YAAY,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,GAAG,aAAa,CA4CrE;AAED,sCAAsC;AACtC,MAAM,WAAW,aAAa;IAC7B,sFAAsF;IACtF,cAAc,CAAC,EAAE,cAAc,CAAA;IAC/B;;;;;;;;;;OAUG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAA;CACzB;AAED,oFAAoF;AACpF,wBAAgB,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAE/D;AAcD;;;;GAIG;AACH,wBAAsB,UAAU,CAC/B,IAAI,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,GAAG,aAAa,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,EAC9E,OAAO,EAAE,aAAa,EACtB,IAAI,GAAE,aAAkB,GACtB,OAAO,CAAC,YAAY,EAAE,CAAC,CAoCzB;AAED;;;GAGG;AACH,MAAM,WAAW,UAAU;IAC1B,GAAG,EAAE,MAAM,GAAG,IAAI,CAAA;IAClB,GAAG,EAAE,MAAM,GAAG,IAAI,CAAA;IAClB,eAAe,EAAE,cAAc,CAAC,MAAM,CAAC,CAAA;IACvC,aAAa,EAAE,MAAM,GAAG,IAAI,CAAA;IAC5B,SAAS,CAAC,EAAE,cAAc,CAAC,WAAW,CAAC,CAAA;CACvC;AAED;;;;;;GAMG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE;IACvC,KAAK,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,UAAU,CAAC,OAAO,eAAe,CAAC,C
AAC,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,OAAO,eAAe,CAAC,CAAC,CAAC,CAAC,CAAA;IAC9G,OAAO,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,GAAG,UAAU,GAAG,IAAI,CAAA;IACxE,OAAO,CAAC,EAAE,MAAM,CAAA;CAChB,GAAG,cAAc,CAgBjB"}
package/out/ingest.js CHANGED
@@ -23,31 +23,139 @@
23
23
  */
24
24
  import { canonicalizeOrganizationName, parsePersonName, toPostalAddress, withGeocode } from "@mailwoman/record";
25
25
  import { parse as parseCsvSync } from "csv-parse/sync";
26
+ import { open } from "node:fs/promises";
27
+ import { Delimiters, TextSpliterator } from "spliterator";
28
+ /** Infer the delimiter from a path's extension (`.tsv` → tab, else comma). */
29
+ export function delimiterFor(path) {
30
+ return /\.tsv$/i.test(path) ? "tab" : "comma";
31
+ }
32
+ /**
33
+ * Stream a delimited file's rows lazily as header-keyed objects — the same shape {@link parseCsv}
34
+ * returns, but **without loading the file into memory**. A multi-GB source (the NPPES registry is
35
+ * ~4.8 GB / 9.6M rows — too big for `readFileSync`, which throws `ERR_STRING_TOO_LONG`) streams
36
+ * line by line. Keys are the original header names so a {@link ColumnMapping} written against the
37
+ * source's headers matches. Filter/sample the stream before {@link ingestRows} to keep only the rows
38
+ * you geocode.
39
+ *
40
+ * We stream _lines_ with spliterator's `TextSpliterator` (pure-Node, the part that handles the huge
41
+ * file) and split each line into columns here with `String.prototype.split`. We deliberately do NOT
42
+ * use `CSVSpliterator`: its column tokenizer hard-codes `skipEmpty` (it builds the column
43
+ * spliterator as `{ delimiter }` with no `skipEmpty: false`), so consecutive delimiters collapse
44
+ * and EMPTY FIELDS ARE DROPPED — fatal for a fixed-column-count registry like NPPES where a row of 330
45
+ * columns full of empties would mis-parse to 40 and shift every value. (Upstream `spliterator` bug;
46
+ * revisit when it's fixed.)
47
+ *
48
+ * Assumes an unquoted delimited file (no fields containing the delimiter) — true for these
49
+ * government TSVs. For small, possibly-quoted CSVs use {@link parseCsv} (quote-aware, in-memory).
50
+ */
51
+ export async function* streamRows(source, opts = {}) {
52
+ const sep = (opts.delimiter ?? delimiterFor(source)) === "tab" ? "\t" : ",";
53
+ // Own the file handle so it's closed deterministically. spliterator's `autoDispose` only fires on
54
+ // natural completion, not on an early `break`/`.return()` — which then leaks the fd (a GC-time error
55
+ // in Node 24+). We open it, pass `autoDispose: false` so spliterator never touches our handle, and
56
+ // close it in `finally` (runs on completion AND when the consumer abandons the generator early).
57
+ const handle = await open(source, "r");
58
+ try {
59
+ let header = null;
60
+ for await (const line of TextSpliterator.fromAsync(handle, {
61
+ delimiter: Delimiters.LineFeed,
62
+ autoDispose: false,
63
+ })) {
64
+ if (line.length === 0)
65
+ continue; // blank line / trailing newline
66
+ const fields = line.replace(/\r$/, "").split(sep); // tolerate CRLF
67
+ if (header === null) {
68
+ header = fields;
69
+ continue;
70
+ }
71
+ const row = {};
72
+ for (let i = 0; i < header.length; i++)
73
+ row[header[i]] = fields[i] ?? "";
74
+ yield row;
75
+ }
76
+ }
77
+ finally {
78
+ await handle.close();
79
+ }
80
+ }
81
+ /**
82
+ * Best-effort {@link ColumnMapping} inferred from a header row — the "point it at any CSV"
83
+ * convenience. Each column name is matched (case- and punctuation-insensitive, on whole tokens) to
84
+ * a field by keyword, in a precedence that resolves the common ambiguities: a dedicated id / phone
85
+ * / email column is claimed before the generic sweep, an org / facility column beats a person
86
+ * "name", and address columns (street / city / state / zip…) collect into one multi-column field.
87
+ * Imperfect on bespoke headers (an explicit mapping or the LLM-assisted inference #603 is the
88
+ * answer there), but it nails tidy and semi-tidy files with no hand-mapping. Unmatched columns are
89
+ * left out.
90
+ */
91
+ export function inferMapping(header) {
92
+ // Pad to whole-token boundaries so "state" doesn't match inside "statement".
93
+ const tok = (h) => ` ${h
94
+ .toLowerCase()
95
+ .replace(/[^a-z0-9]+/g, " ")
96
+ .trim()} `;
97
+ const mapping = {};
98
+ const name = [];
99
+ const address = [];
100
+ for (const column of header) {
101
+ const h = tok(column);
102
+ const has = (...words) => words.some((w) => h.includes(` ${w} `));
103
+ if (!mapping.email && has("email", "e mail"))
104
+ mapping.email = column;
105
+ else if (!mapping.phone && has("phone", "telephone", "tel", "mobile", "cell"))
106
+ mapping.phone = column;
107
+ else if (!mapping.id && has("id", "npi", "ein", "frn", "spin", "uuid", "guid", "key"))
108
+ mapping.id = column;
109
+ else if (has("org", "organization", "organisation", "company", "business", "facility", "agency", "employer"))
110
+ mapping.organization ??= column;
111
+ else if (has("street", "address", "addr", "city", "town", "state", "province", "zip", "zipcode", "postal", "postcode", "county"))
112
+ address.push(column);
113
+ else if (has("name", "first", "last", "given", "family", "middle", "surname", "fullname", "contact"))
114
+ name.push(column);
115
+ }
116
+ if (name.length)
117
+ mapping.name = name.length === 1 ? name[0] : name;
118
+ if (address.length)
119
+ mapping.address = address;
120
+ return mapping;
121
+ }
26
122
  /** Parse a CSV string (with a header row) into row objects keyed by column name. */
27
123
  export function parseCsv(text) {
28
124
  return parseCsvSync(text, { columns: true, skip_empty_lines: true, trim: true, relax_column_count: true });
29
125
  }
30
126
  /** Join the named column(s) of a row into a single trimmed string, or undefined if empty. */
31
- function pick(row, columns) {
127
+ function pick(row, columns, separator = " ") {
32
128
  if (!columns)
33
129
  return undefined;
34
130
  const list = Array.isArray(columns) ? columns : [columns];
35
131
  const value = list
36
132
  .map((c) => row[c]?.trim())
37
133
  .filter(Boolean)
38
- .join(" ")
134
+ .join(separator)
39
135
  .trim();
40
136
  return value || undefined;
41
137
  }
42
- /** Normalize tabular rows into {@link SourceRecord}s under a {@link ColumnMapping}. */
138
+ /**
139
+ * Normalize tabular rows into {@link SourceRecord}s under a {@link ColumnMapping}. Accepts a sync OR
140
+ * async iterable, so {@link parseCsv} (in-memory) and {@link streamRows} (lazy, for huge files) both
141
+ * thread straight through.
142
+ */
43
143
  export async function ingestRows(rows, mapping, opts = {}) {
44
144
  const records = [];
45
145
  let index = 0;
46
- for (const row of rows) {
146
+ for await (const row of rows) {
47
147
  const id = (mapping.id ? row[mapping.id]?.trim() : "") || String(index);
48
148
  const nameValue = pick(row, mapping.name);
49
149
  const orgValue = pick(row, mapping.organization);
50
- const addressValue = pick(row, mapping.address);
150
+ const addressValue = pick(row, mapping.address, opts.addressSeparator ?? ", ");
151
+ let attributes;
152
+ if (mapping.attributes) {
153
+ for (const [key, columns] of Object.entries(mapping.attributes)) {
154
+ const value = pick(row, columns);
155
+ if (value)
156
+ (attributes ??= {})[key] = value;
157
+ }
158
+ }
51
159
  const record = {
52
160
  id,
53
161
  source: mapping.source,
@@ -56,6 +164,7 @@ export async function ingestRows(rows, mapping, opts = {}) {
56
164
  phone: (mapping.phone && row[mapping.phone]?.trim()) || undefined,
57
165
  email: (mapping.email && row[mapping.email]?.trim()?.toLowerCase()) || undefined,
58
166
  address: addressValue && opts.geocodeAddress ? ((await opts.geocodeAddress(addressValue)) ?? undefined) : undefined,
167
+ attributes,
59
168
  raw: row,
60
169
  };
61
170
  records.push(record);
package/out/ingest.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"ingest.js","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAGH,OAAO,EAAE,4BAA4B,EAAE,eAAe,EAAE,eAAe,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAA;AAC/G,OAAO,EAAE,KAAK,IAAI,YAAY,EAAE,MAAM,gBAAgB,CAAA;AA0BtD,oFAAoF;AACpF,MAAM,UAAU,QAAQ,CAAC,IAAY;IACpC,OAAO,YAAY,CAAC,IAAI,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,gBAAgB,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,kBAAkB,EAAE,IAAI,EAAE,CAAC,CAAA;AAC3G,CAAC;AAED,6FAA6F;AAC7F,SAAS,IAAI,CAAC,GAA2B,EAAE,OAA2B;IACrE,IAAI,CAAC,OAAO;QAAE,OAAO,SAAS,CAAA;IAC9B,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAA;IACzD,MAAM,KAAK,GAAG,IAAI;SAChB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;SAC1B,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,GAAG,CAAC;SACT,IAAI,EAAE,CAAA;IACR,OAAO,KAAK,IAAI,SAAS,CAAA;AAC1B,CAAC;AAED,uFAAuF;AACvF,MAAM,CAAC,KAAK,UAAU,UAAU,CAC/B,IAAsC,EACtC,OAAsB,EACtB,OAAsB,EAAE;IAExB,MAAM,OAAO,GAAmB,EAAE,CAAA;IAClC,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACxB,MAAM,EAAE,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,MAAM,CAAC,KAAK,CAAC,CAAA;QACvE,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,CAAC,CAAA;QACzC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,YAAY,CAAC,CAAA;QAChD,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,OAAO,CAAC,CAAA;QAE/C,MAAM,MAAM,GAAiB;YAC5B,EAAE;YACF,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;YACxD,YAAY,EAAE,QAAQ,CAAC,CAAC,CAAC,4BAA4B,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS;YAC3E,KAAK,EAAE,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,CAAC,IAAI,SAAS;YACjE,KAAK,EAAE,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,EAAE,WAAW,EAAE,CAAC,IAAI,SAAS;YAChF,OAAO,EACN,YAAY,IAAI,IAAI,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,cAAc,CAAC,YAAY,CAAC,CAAC,IAAI,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;YAC3G,GAAG,EAAE,GAAG;SA
CR,CAAA;QAED,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACpB,KAAK,EAAE,CAAA;IACR,CAAC;IAED,OAAO,OAAO,CAAA;AACf,CAAC;AAcD;;;;;;GAMG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAIjC;IACA,OAAO,KAAK,EAAE,GAAW,EAAiC,EAAE;QAC3D,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;QACxC,MAAM,IAAI,GAAG,eAAe,CAAC,UAAU,EAAE,EAAE,OAAO,EAAE,IAAI,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,CAAA;QAExE,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAA;QACxC,IAAI,CAAC,QAAQ,IAAI,QAAQ,CAAC,GAAG,KAAK,IAAI,IAAI,QAAQ,CAAC,GAAG,KAAK,IAAI;YAAE,OAAO,IAAI,CAAA;QAE5E,MAAM,OAAO,GAAmB;YAC/B,UAAU,EAAE,EAAE,QAAQ,EAAE,QAAQ,CAAC,GAAG,EAAE,SAAS,EAAE,QAAQ,CAAC,GAAG,EAAE;YAC/D,IAAI,EAAE,QAAQ,CAAC,eAAe;YAC9B,iBAAiB,EAAE,QAAQ,CAAC,aAAa;YACzC,SAAS,EAAE,QAAQ,CAAC,SAAS;SAC7B,CAAA;QACD,OAAO,WAAW,CAAC,IAAI,EAAE,OAAO,CAAC,CAAA;IAClC,CAAC,CAAA;AACF,CAAC"}
1
+ {"version":3,"file":"ingest.js","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAGH,OAAO,EAAE,4BAA4B,EAAE,eAAe,EAAE,eAAe,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAA;AAC/G,OAAO,EAAE,KAAK,IAAI,YAAY,EAAE,MAAM,gBAAgB,CAAA;AACtD,OAAO,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAA;AACvC,OAAO,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,aAAa,CAAA;AASzD,8EAA8E;AAC9E,MAAM,UAAU,YAAY,CAAC,IAAY;IACxC,OAAO,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,CAAA;AAC9C,CAAC;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,MAAM,CAAC,KAAK,SAAS,CAAC,CAAC,UAAU,CAChC,MAAc,EACd,OAAkC,EAAE;IAEpC,MAAM,GAAG,GAAG,CAAC,IAAI,CAAC,SAAS,IAAI,YAAY,CAAC,MAAM,CAAC,CAAC,KAAK,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAA;IAC3E,kGAAkG;IAClG,qGAAqG;IACrG,mGAAmG;IACnG,iGAAiG;IACjG,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;IACtC,IAAI,CAAC;QACJ,IAAI,MAAM,GAAoB,IAAI,CAAA;QAClC,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,eAAe,CAAC,SAAS,CAAC,MAAM,EAAE;YAC1D,SAAS,EAAE,UAAU,CAAC,QAAQ;YAC9B,WAAW,EAAE,KAAK;SAClB,CAAC,EAAE,CAAC;YACJ,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;gBAAE,SAAQ,CAAC,gCAAgC;YAChE,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA,CAAC,gBAAgB;YAClE,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;gBACrB,MAAM,GAAG,MAAM,CAAA;gBACf,SAAQ;YACT,CAAC;YACD,MAAM,GAAG,GAA2B,EAAE,CAAA;YACtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE;gBAAE,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;YACzE,MAAM,GAAG,CAAA;QACV,CAAC;IACF,CAAC;YAAS,CAAC;QACV,MAAM,MAAM,CAAC,KAAK,EAAE,CAAA;IACrB,CAAC;AACF,CAAC;AAwBD;;;;;;;;;GASG;AACH,MAAM,UAAU,YAAY,CAAC,MAAyB;IACrD,6EAA6E;IAC7E,MAAM,GAAG,GAAG,CAAC,CAAS,EAAE,EAAE,CACzB,IAAI,CAAC;SACH,WAAW,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC;SAC3B,IAAI,EAAE,GAAG,CAAA;IACZ,MAAM,OAAO,GAAkB,EAAE,CAAA;IACjC,MAAM,IAAI,GAAa,EAAE,CAAA;IACzB,MAAM,OAAO,GAAa,EAAE,CAAA;IAE5B,KAAK,MAAM,MAAM,IAAI,MAAM,EAAE,CAAC;QAC7B,MAAM,CAAC,GAAG,GAAG,CAAC,MAAM,CAAC,CAAA;QACrB,MAAM,GAAG,GAAG,CAAC,GAAG,KAAe,EAAW,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC
,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QAEpF,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC;YAAE,OAAO,CAAC,KAAK,GAAG,MAAM,CAAA;aAC/D,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,EAAE,WAAW,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,CAAC;YAAE,OAAO,CAAC,KAAK,GAAG,MAAM,CAAA;aAChG,IAAI,CAAC,OAAO,CAAC,EAAE,IAAI,GAAG,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,CAAC;YAAE,OAAO,CAAC,EAAE,GAAG,MAAM,CAAA;aACrG,IAAI,GAAG,CAAC,KAAK,EAAE,cAAc,EAAE,cAAc,EAAE,SAAS,EAAE,UAAU,EAAE,UAAU,EAAE,QAAQ,EAAE,UAAU,CAAC;YAC3G,OAAO,CAAC,YAAY,KAAK,MAAM,CAAA;aAC3B,IACJ,GAAG,CACF,QAAQ,EACR,SAAS,EACT,MAAM,EACN,MAAM,EACN,MAAM,EACN,OAAO,EACP,UAAU,EACV,KAAK,EACL,SAAS,EACT,QAAQ,EACR,UAAU,EACV,QAAQ,CACR;YAED,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;aAChB,IAAI,GAAG,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,SAAS,EAAE,UAAU,EAAE,SAAS,CAAC;YACnG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACnB,CAAC;IAED,IAAI,IAAI,CAAC,MAAM;QAAE,OAAO,CAAC,IAAI,GAAG,IAAI,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,CAAC,CAAC,IAAI,CAAA;IACnE,IAAI,OAAO,CAAC,MAAM;QAAE,OAAO,CAAC,OAAO,GAAG,OAAO,CAAA;IAC7C,OAAO,OAAO,CAAA;AACf,CAAC;AAoBD,oFAAoF;AACpF,MAAM,UAAU,QAAQ,CAAC,IAAY;IACpC,OAAO,YAAY,CAAC,IAAI,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,gBAAgB,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,kBAAkB,EAAE,IAAI,EAAE,CAAC,CAAA;AAC3G,CAAC;AAED,6FAA6F;AAC7F,SAAS,IAAI,CAAC,GAA2B,EAAE,OAA2B,EAAE,SAAS,GAAG,GAAG;IACtF,IAAI,CAAC,OAAO;QAAE,OAAO,SAAS,CAAA;IAC9B,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAA;IACzD,MAAM,KAAK,GAAG,IAAI;SAChB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;SAC1B,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,SAAS,CAAC;SACf,IAAI,EAAE,CAAA;IACR,OAAO,KAAK,IAAI,SAAS,CAAA;AAC1B,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAC/B,IAA8E,EAC9E,OAAsB,EACtB,OAAsB,EAAE;IAExB,MAAM,OAAO,GAAmB,EAAE,CAAA;IAClC,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,IAAI,KAAK,EAAE,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QAC9B
,MAAM,EAAE,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,MAAM,CAAC,KAAK,CAAC,CAAA;QACvE,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,CAAC,CAAA;QACzC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,YAAY,CAAC,CAAA;QAChD,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,gBAAgB,IAAI,IAAI,CAAC,CAAA;QAE9E,IAAI,UAA8C,CAAA;QAClD,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;YACxB,KAAK,MAAM,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,CAAC;gBACjE,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,CAAA;gBAChC,IAAI,KAAK;oBAAE,CAAC,UAAU,KAAK,EAAE,CAAC,CAAC,GAAG,CAAC,GAAG,KAAK,CAAA;YAC5C,CAAC;QACF,CAAC;QAED,MAAM,MAAM,GAAiB;YAC5B,EAAE;YACF,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;YACxD,YAAY,EAAE,QAAQ,CAAC,CAAC,CAAC,4BAA4B,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS;YAC3E,KAAK,EAAE,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,CAAC,IAAI,SAAS;YACjE,KAAK,EAAE,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,EAAE,WAAW,EAAE,CAAC,IAAI,SAAS;YAChF,OAAO,EACN,YAAY,IAAI,IAAI,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,cAAc,CAAC,YAAY,CAAC,CAAC,IAAI,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;YAC3G,UAAU;YACV,GAAG,EAAE,GAAG;SACR,CAAA;QAED,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACpB,KAAK,EAAE,CAAA;IACR,CAAC;IAED,OAAO,OAAO,CAAA;AACf,CAAC;AAcD;;;;;;GAMG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAIjC;IACA,OAAO,KAAK,EAAE,GAAW,EAAiC,EAAE;QAC3D,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;QACxC,MAAM,IAAI,GAAG,eAAe,CAAC,UAAU,EAAE,EAAE,OAAO,EAAE,IAAI,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,CAAA;QAExE,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAA;QACxC,IAAI,CAAC,QAAQ,IAAI,QAAQ,CAAC,GAAG,KAAK,IAAI,IAAI,QAAQ,CAAC,GAAG,KAAK,IAAI;YAAE,OAAO,IAAI,CAAA;QAE5E,MAAM,OAAO,GAAmB;YAC/B,UAAU,EAAE,EAAE,QAAQ,EAAE,QAAQ,CAAC,GAAG,EAAE,SAAS,EAAE,QAAQ,CAAC,GAAG,EAAE;YAC/D,IAAI,EAAE,QAAQ,CAAC,eAAe;YAC9B,iBAAiB,EAAE,QAAQ,CAAC,aAAa;YACzC,SAAS,EAAE,QAAQ,CA
AC,SAAS;SAC7B,CAAA;QACD,OAAO,WAAW,CAAC,IAAI,EAAE,OAAO,CAAC,CAAA;IAClC,CAAC,CAAA;AACF,CAAC"}
@@ -0,0 +1,59 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * The learned scorer (#603) — the production wiring for the gradient-boosted-tree model behind
7
+ * {@link ResolveConfig.scorer}. Two pieces:
8
+ *
9
+ * 1. {@link createMatchFeaturizer} — the ONE feature extractor for a candidate pair, used identically
10
+ * at train time (`scripts/record-matcher/train-gbt.ts`), eval time (the learned-scorer
11
+ * evals), and inference time (here). A pair → one-hot of each comparison's agreement level +
12
+ * the over-merge interaction terms (co-located × name/org disagreement) + address
13
+ * crowdedness.
14
+ * 2. {@link createGbtScorer} — wraps a trained {@link GBT} + the featurizer into the `(a, b) => number`
15
+ * the resolve pipeline's `scorer` hook expects (a logit, threshold-comparable with the
16
+ * Fellegi-Sunter weight it replaces).
17
+ *
18
+ * Both take the comparison set as INPUT (rather than importing {@link buildDefaultModel}) so this
19
+ * module has no dependency cycle with `resolve.ts`. The contract that keeps train ≡ inference:
20
+ * feed the comparisons from `buildDefaultModel({ collapseSpatial: true, addressFrequency })` —
21
+ * the model's structure (and thus the feature layout) is fixed by that config; only the frequency
22
+ * VALUES differ between the training corpus and the matched set, which is the point (the model
23
+ * generalizes, as the cross-state eval showed).
24
+ */
25
+ import { type Comparison, type GBT, type TermFrequencyTable } from "@mailwoman/match";
26
+ import type { SourceRecord } from "./types.js";
27
+ /** Inputs shared by the featurizer + the scorer factory. */
28
+ export interface LearnedFeatureConfig {
29
+ /**
30
+ * The comparison set the features are built over — MUST be `buildDefaultModel({ collapseSpatial:
31
+ * true, addressFrequency }).comparisons` so the feature layout matches the trained model.
32
+ * (`usePhone` / `discriminators` are NOT part of the learned feature model — the GBT replaces the
33
+ * FS weight wholesale and owns its own feature vector.)
34
+ */
35
+ comparisons: Comparison<SourceRecord>[];
36
+ /**
37
+ * Address-frequency table for the crowdedness feature (a crowded shared address is weak
38
+ * identity).
39
+ */
40
+ addressFrequency: TermFrequencyTable;
41
+ }
42
+ /**
43
+ * Build the per-pair feature extractor. The vector is: one-hot of each comparison's agreement
44
+ * level, then the two over-merge interaction terms (spatial-exact × name-disagree, spatial-exact ×
45
+ * org-disagree — the "same place, different names" signature that drives co-located over-merges),
46
+ * then address crowdedness scaled into [0, 1]. Deterministic and EM-independent, so it is identical
47
+ * across train / eval / inference.
48
+ */
49
+ export declare function createMatchFeaturizer(config: LearnedFeatureConfig): (a: SourceRecord, b: SourceRecord) => number[];
50
+ /**
51
+ * Wrap a trained {@link GBT} into the `(a, b) => number` link scorer for
52
+ * {@link ResolveConfig.scorer}. The returned weight is the model's logit — same threshold-comparable
53
+ * units as the Fellegi-Sunter weight it replaces, so the pipeline's clustering + threshold
54
+ * semantics are unchanged.
55
+ */
56
+ export declare function createGbtScorer(config: LearnedFeatureConfig & {
57
+ model: GBT;
58
+ }): (a: SourceRecord, b: SourceRecord) => number;
59
+ //# sourceMappingURL=learned-scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"learned-scorer.d.ts","sourceRoot":"","sources":["../learned-scorer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,EAAoB,KAAK,UAAU,EAAE,KAAK,GAAG,EAAY,KAAK,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AACjH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C,4DAA4D;AAC5D,MAAM,WAAW,oBAAoB;IACpC;;;;;OAKG;IACH,WAAW,EAAE,UAAU,CAAC,YAAY,CAAC,EAAE,CAAA;IACvC;;;OAGG;IACH,gBAAgB,EAAE,kBAAkB,CAAA;CACpC;AAED;;;;;;GAMG;AACH,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,oBAAoB,GAAG,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,KAAK,MAAM,EAAE,CAkClH;AAED;;;;;GAKG;AACH,wBAAgB,eAAe,CAC9B,MAAM,EAAE,oBAAoB,GAAG;IAAE,KAAK,EAAE,GAAG,CAAA;CAAE,GAC3C,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,KAAK,MAAM,CAI9C"}