@mailwoman/registry 4.8.1 → 4.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +92 -0
- package/out/address-key.d.ts +31 -0
- package/out/address-key.d.ts.map +1 -0
- package/out/address-key.js +38 -0
- package/out/address-key.js.map +1 -0
- package/out/geojson.d.ts.map +1 -1
- package/out/geojson.js +2 -0
- package/out/geojson.js.map +1 -1
- package/out/index.d.ts +5 -0
- package/out/index.d.ts.map +1 -1
- package/out/index.js +5 -0
- package/out/index.js.map +1 -1
- package/out/ingest.d.ts +65 -4
- package/out/ingest.d.ts.map +1 -1
- package/out/ingest.js +114 -5
- package/out/ingest.js.map +1 -1
- package/out/learned-scorer.d.ts +59 -0
- package/out/learned-scorer.d.ts.map +1 -0
- package/out/learned-scorer.js +78 -0
- package/out/learned-scorer.js.map +1 -0
- package/out/map-html.d.ts +51 -0
- package/out/map-html.d.ts.map +1 -0
- package/out/map-html.js +262 -0
- package/out/map-html.js.map +1 -0
- package/out/models/dedup-gbt-en-us.d.ts +36 -0
- package/out/models/dedup-gbt-en-us.d.ts.map +1 -0
- package/out/models/dedup-gbt-en-us.js +36 -0
- package/out/models/dedup-gbt-en-us.js.map +1 -0
- package/out/reconcile.d.ts +86 -0
- package/out/reconcile.d.ts.map +1 -0
- package/out/reconcile.js +161 -0
- package/out/reconcile.js.map +1 -0
- package/out/resolve.d.ts +114 -4
- package/out/resolve.d.ts.map +1 -1
- package/out/resolve.js +165 -21
- package/out/resolve.js.map +1 -1
- package/out/types.d.ts +7 -0
- package/out/types.d.ts.map +1 -1
- package/out/types.js.map +1 -1
- package/package.json +7 -4
package/README.md
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# @mailwoman/registry
|
|
2
|
+
|
|
3
|
+
**Geocode-first record-matching application** — the high-level entry point that
|
|
4
|
+
runs the full block → score → cluster pipeline over ingested records and returns
|
|
5
|
+
canonical entities ready for export.
|
|
6
|
+
|
|
7
|
+
This is the clinic-funding use case Mailwoman was built for, standing on a
|
|
8
|
+
calibrated, label-free matcher.
|
|
9
|
+
|
|
10
|
+
```ts
|
|
11
|
+
import { resolveEntities, ingestRows, toGeoJSON } from "@mailwoman/registry";
|
|
12
|
+
|
|
13
|
+
// 1. Ingest — CSV/array → normalized SourceRecords
|
|
14
|
+
const records = await ingestRows(rows, {
|
|
15
|
+
name: "Provider Name", address: ["Street Address", "City"], // …other ColumnMapping fields
|
|
16
|
+
});
|
|
17
|
+
|
|
18
|
+
// 2. Resolve — block → score → cluster with geo-first defaults
|
|
19
|
+
const entities = resolveEntities(records, {
|
|
20
|
+
geocodeAddress: async (row) => ({ lat: 30.2672, lon: -97.7431 }),
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
// 3. Export — GeoJSON for QGIS
|
|
24
|
+
const fc = toGeoJSON(entities);
|
|
25
|
+
// → FeatureCollection with Point features + entity properties
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## The full pipeline
|
|
29
|
+
|
|
30
|
+
```
|
|
31
|
+
CSV / SQLite → ingestRows → SourceRecord[] → resolveEntities → ResolvedEntity[]
|
|
32
|
+
↓
|
|
33
|
+
toGeoJSON()
|
|
34
|
+
↓
|
|
35
|
+
GeoJSON → QGIS
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## API
|
|
39
|
+
|
|
40
|
+
```ts
|
|
41
|
+
// Ingest — parse CSV / map columns → normalized records
|
|
42
|
+
import { ingestRows, parseCsv, inferMapping } from "@mailwoman/registry"
|
|
43
|
+
// {
|
|
44
|
+
// ingestRows(rows, mapping, opts?): Promise<SourceRecord[]>
|
|
45
|
+
// parseCsv(csvText): Record<string, string>[]
|
|
46
|
+
// inferMapping(headers): ColumnMapping
|
|
47
|
+
// }
|
|
48
|
+
|
|
49
|
+
// Resolve — run the full matcher pipeline
|
|
50
|
+
import { resolveEntities } from "@mailwoman/registry"
|
|
51
|
+
// resolveEntities(records, config): ResolvedEntity[]
|
|
52
|
+
// Config: { geocodeAddress?, scorer?, blockingKeys?, threshold?, discriminators? }
|
|
53
|
+
|
|
54
|
+
// Export — GeoJSON, MapLibre HTML, reconciliation reports
|
|
55
|
+
import { toGeoJSON, toMapHTML, reconcile } from "@mailwoman/registry"
|
|
56
|
+
|
|
57
|
+
// Learned scorer — pre-trained GBT for single-dataset dedup
|
|
58
|
+
import { dedupGbtEnUs } from "@mailwoman/registry"
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Default configuration
|
|
62
|
+
|
|
63
|
+
`resolveEntities` ships with sensible defaults:
|
|
64
|
+
|
|
65
|
+
- **Blocking keys:** geo-cell (H3) + canonical address + phone + email
|
|
66
|
+
- **Scoring model:** Fellegi-Sunter with label-free EM, term frequency adjustment
|
|
67
|
+
- **Learned scorer:** optional GBT for single-dataset dedup (opt-in via `scorer`)
|
|
68
|
+
- **Threshold:** 0.5 (configurable precision/recall knob)
|
|
69
|
+
|
|
70
|
+
## CLI
|
|
71
|
+
|
|
72
|
+
The `mailwoman` CLI exposes `registry` as a command:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
# Multi-source entity resolution
|
|
76
|
+
mailwoman registry --sources config.json --out entities.geojson
|
|
77
|
+
|
|
78
|
+
# Cross-dataset reconciliation
|
|
79
|
+
mailwoman registry --sources tx-nppes.json --reconcile tx-fcc.json
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Related
|
|
83
|
+
|
|
84
|
+
- [`@mailwoman/match`](../match) — the low-level block/score/cluster primitives
|
|
85
|
+
- [`@mailwoman/record`](../record) — `SourceRecord` schema and normalizers
|
|
86
|
+
- [`@mailwoman/address-id`](../address-id) — exact-match join key
|
|
87
|
+
- [Geocode-First Record Matching](https://mailwoman.sister.software/articles/concepts/geocode-first-record-matching/)
|
|
88
|
+
- [Dedup Entity Truth](https://mailwoman.sister.software/articles/concepts/dedup-entity-truth/)
|
|
89
|
+
|
|
90
|
+
## License
|
|
91
|
+
|
|
92
|
+
[AGPL-3.0-only](https://www.gnu.org/licenses/agpl-3.0.html)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* The address-id consumer for the matcher (#259). Derives a stable {@link PostalAddressID} from a
|
|
7
|
+
* resolved {@link SourceRecord} and exposes it as a blocking key — the deterministic,
|
|
8
|
+
* exact-canonical-address complement to the fuzzy Fellegi-Sunter / GBT scoring. Two uses:
|
|
9
|
+
*
|
|
10
|
+
* - **As a pre-dedup / join key:** `GROUP BY postalAddressId(record)` collapses records that resolve
|
|
11
|
+
* to the same place AND share a canonical address with NO scoring at all — the cheap, certain
|
|
12
|
+
* slice of dedup before the matcher does the fuzzy rest.
|
|
13
|
+
* - **As a blocking key:** {@link addressIdBlockingKey} adds the address-id to the blocking union, so
|
|
14
|
+
* records sharing one are guaranteed to be compared.
|
|
15
|
+
*/
|
|
16
|
+
import { type PostalAddressID } from "@mailwoman/address-id";
|
|
17
|
+
import { type BlockingKey } from "@mailwoman/match";
|
|
18
|
+
import type { SourceRecord } from "./types.js";
|
|
19
|
+
/**
|
|
20
|
+
* The stable address primary key for a record, or null when it isn't geocoded (no coordinate → no
|
|
21
|
+
* locality cell) or carries no raw address to hash. Uses the resolved coordinate + the raw address;
|
|
22
|
+
* the state prefix is plucked from the address when present.
|
|
23
|
+
*/
|
|
24
|
+
export declare function postalAddressId(record: SourceRecord): PostalAddressID | null;
|
|
25
|
+
/**
|
|
26
|
+
* A blocking key on the {@link postalAddressId} — records that resolve to the same place with the
|
|
27
|
+
* same canonical address block together. Add it to {@link defaultBlockingKeys}'s union when an exact
|
|
28
|
+
* address join should never be missed.
|
|
29
|
+
*/
|
|
30
|
+
export declare function addressIdBlockingKey(): BlockingKey<SourceRecord>;
|
|
31
|
+
//# sourceMappingURL=address-key.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"address-key.d.ts","sourceRoot":"","sources":["../address-key.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,EAAyB,KAAK,eAAe,EAAE,MAAM,uBAAuB,CAAA;AACnF,OAAO,EAAE,KAAK,WAAW,EAAY,MAAM,kBAAkB,CAAA;AAC7D,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,MAAM,EAAE,YAAY,GAAG,eAAe,GAAG,IAAI,CAK5E;AAED;;;;GAIG;AACH,wBAAgB,oBAAoB,IAAI,WAAW,CAAC,YAAY,CAAC,CAEhE"}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* The address-id consumer for the matcher (#259). Derives a stable {@link PostalAddressID} from a
|
|
7
|
+
* resolved {@link SourceRecord} and exposes it as a blocking key — the deterministic,
|
|
8
|
+
* exact-canonical-address complement to the fuzzy Fellegi-Sunter / GBT scoring. Two uses:
|
|
9
|
+
*
|
|
10
|
+
* - **As a pre-dedup / join key:** `GROUP BY postalAddressId(record)` collapses records that resolve
|
|
11
|
+
* to the same place AND share a canonical address with NO scoring at all — the cheap, certain
|
|
12
|
+
* slice of dedup before the matcher does the fuzzy rest.
|
|
13
|
+
* - **As a blocking key:** {@link addressIdBlockingKey} adds the address-id to the blocking union, so
|
|
14
|
+
* records sharing one are guaranteed to be compared.
|
|
15
|
+
*/
|
|
16
|
+
import { createPostalAddressID } from "@mailwoman/address-id";
|
|
17
|
+
import { exactKey } from "@mailwoman/match";
|
|
18
|
+
/**
|
|
19
|
+
* The stable address primary key for a record, or null when it isn't geocoded (no coordinate → no
|
|
20
|
+
* locality cell) or carries no raw address to hash. Uses the resolved coordinate + the raw address;
|
|
21
|
+
* the state prefix is plucked from the address when present.
|
|
22
|
+
*/
|
|
23
|
+
export function postalAddressId(record) {
|
|
24
|
+
const coordinate = record.address?.geocode?.coordinate;
|
|
25
|
+
const address = record.address?.raw;
|
|
26
|
+
if (!coordinate || !address)
|
|
27
|
+
return null;
|
|
28
|
+
return createPostalAddressID({ coordinate, address });
|
|
29
|
+
}
|
|
30
|
+
/**
|
|
31
|
+
* A blocking key on the {@link postalAddressId} — records that resolve to the same place with the
|
|
32
|
+
* same canonical address block together. Add it to {@link defaultBlockingKeys}'s union when an exact
|
|
33
|
+
* address join should never be missed.
|
|
34
|
+
*/
|
|
35
|
+
export function addressIdBlockingKey() {
|
|
36
|
+
return exactKey((record) => postalAddressId(record));
|
|
37
|
+
}
|
|
38
|
+
//# sourceMappingURL=address-key.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"address-key.js","sourceRoot":"","sources":["../address-key.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,EAAE,qBAAqB,EAAwB,MAAM,uBAAuB,CAAA;AACnF,OAAO,EAAoB,QAAQ,EAAE,MAAM,kBAAkB,CAAA;AAG7D;;;;GAIG;AACH,MAAM,UAAU,eAAe,CAAC,MAAoB;IACnD,MAAM,UAAU,GAAG,MAAM,CAAC,OAAO,EAAE,OAAO,EAAE,UAAU,CAAA;IACtD,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,EAAE,GAAG,CAAA;IACnC,IAAI,CAAC,UAAU,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IACxC,OAAO,qBAAqB,CAAC,EAAE,UAAU,EAAE,OAAO,EAAE,CAAC,CAAA;AACtD,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,oBAAoB;IACnC,OAAO,QAAQ,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,CAAA;AACrD,CAAC"}
|
package/out/geojson.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"geojson.d.ts","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,KAAK,EAAkB,wBAAwB,EAAE,cAAc,EAAgB,MAAM,YAAY,CAAA;
|
|
1
|
+
{"version":3,"file":"geojson.d.ts","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,KAAK,EAAkB,wBAAwB,EAAE,cAAc,EAAgB,MAAM,YAAY,CAAA;AAqCxG;;;GAGG;AACH,wBAAgB,SAAS,CAAC,QAAQ,EAAE,SAAS,cAAc,EAAE,GAAG,wBAAwB,CAKvF"}
|
package/out/geojson.js
CHANGED
|
@@ -34,6 +34,8 @@ function toFeature(entity) {
|
|
|
34
34
|
recordCount: entity.records.length,
|
|
35
35
|
cohesion: entity.cohesion,
|
|
36
36
|
sourceIds: entity.records.map((r) => r.id),
|
|
37
|
+
// Distinct provenance labels the entity's records span — an entity with ≥2 is a cross-dataset link.
|
|
38
|
+
sources: [...new Set(entity.records.map((r) => r.source).filter((s) => !!s))].sort(),
|
|
37
39
|
name: displayName(rep),
|
|
38
40
|
organization: rep.organization?.canonical ?? null,
|
|
39
41
|
address: rep.address?.formatted ?? null,
|
package/out/geojson.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"geojson.js","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAIH,0EAA0E;AAC1E,SAAS,WAAW,CAAC,MAAoB;IACxC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAA;IACxB,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAA;IACtB,MAAM,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,cAAc,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC;SAClG,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,GAAG,CAAC;SACT,IAAI,EAAE,CAAA;IACR,OAAO,MAAM,IAAI,IAAI,CAAA;AACtB,CAAC;AAED,8CAA8C;AAC9C,SAAS,SAAS,CAAC,MAAsB;IACxC,MAAM,GAAG,GAAG,MAAM,CAAC,cAAc,CAAA;IACjC,OAAO;QACN,IAAI,EAAE,SAAS;QACf,QAAQ,EAAE;YACT,IAAI,EAAE,OAAO;YACb,WAAW,EAAE,CAAC,MAAM,CAAC,UAAW,CAAC,SAAS,EAAE,MAAM,CAAC,UAAW,CAAC,QAAQ,CAAC;SACxE;QACD,UAAU,EAAE;YACX,QAAQ,EAAE,MAAM,CAAC,EAAE;YACnB,WAAW,EAAE,MAAM,CAAC,OAAO,CAAC,MAAM;YAClC,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,SAAS,EAAE,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC1C,IAAI,EAAE,WAAW,CAAC,GAAG,CAAC;YACtB,YAAY,EAAE,GAAG,CAAC,YAAY,EAAE,SAAS,IAAI,IAAI;YACjD,OAAO,EAAE,GAAG,CAAC,OAAO,EAAE,SAAS,IAAI,IAAI;YACvC,WAAW,EAAE,GAAG,CAAC,OAAO,EAAE,OAAO,EAAE,IAAI,IAAI,IAAI;SAC/C;KACD,CAAA;AACF,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,SAAS,CAAC,QAAmC;IAC5D,OAAO;QACN,IAAI,EAAE,mBAAmB;QACzB,QAAQ,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC;KACvE,CAAA;AACF,CAAC"}
|
|
1
|
+
{"version":3,"file":"geojson.js","sourceRoot":"","sources":["../geojson.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAIH,0EAA0E;AAC1E,SAAS,WAAW,CAAC,MAAoB;IACxC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAA;IACxB,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAA;IACtB,MAAM,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,cAAc,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC;SAClG,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,GAAG,CAAC;SACT,IAAI,EAAE,CAAA;IACR,OAAO,MAAM,IAAI,IAAI,CAAA;AACtB,CAAC;AAED,8CAA8C;AAC9C,SAAS,SAAS,CAAC,MAAsB;IACxC,MAAM,GAAG,GAAG,MAAM,CAAC,cAAc,CAAA;IACjC,OAAO;QACN,IAAI,EAAE,SAAS;QACf,QAAQ,EAAE;YACT,IAAI,EAAE,OAAO;YACb,WAAW,EAAE,CAAC,MAAM,CAAC,UAAW,CAAC,SAAS,EAAE,MAAM,CAAC,UAAW,CAAC,QAAQ,CAAC;SACxE;QACD,UAAU,EAAE;YACX,QAAQ,EAAE,MAAM,CAAC,EAAE;YACnB,WAAW,EAAE,MAAM,CAAC,OAAO,CAAC,MAAM;YAClC,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,SAAS,EAAE,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC1C,oGAAoG;YACpG,OAAO,EAAE,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE;YACjG,IAAI,EAAE,WAAW,CAAC,GAAG,CAAC;YACtB,YAAY,EAAE,GAAG,CAAC,YAAY,EAAE,SAAS,IAAI,IAAI;YACjD,OAAO,EAAE,GAAG,CAAC,OAAO,EAAE,SAAS,IAAI,IAAI;YACvC,WAAW,EAAE,GAAG,CAAC,OAAO,EAAE,OAAO,EAAE,IAAI,IAAI,IAAI;SAC/C;KACD,CAAA;AACF,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,SAAS,CAAC,QAAmC;IAC5D,OAAO;QACN,IAAI,EAAE,mBAAmB;QACzB,QAAQ,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC;KACvE,CAAA;AACF,CAAC"}
|
package/out/index.d.ts
CHANGED
|
@@ -10,8 +10,13 @@
|
|
|
10
10
|
* {@link toGeoJSON} exports them for QGIS. This is the clinic-funding use case mailwoman was built
|
|
11
11
|
* for, finally standing on a calibrated, label-free matcher.
|
|
12
12
|
*/
|
|
13
|
+
export * from "./address-key.js";
|
|
13
14
|
export * from "./geojson.js";
|
|
14
15
|
export * from "./ingest.js";
|
|
16
|
+
export * from "./learned-scorer.js";
|
|
17
|
+
export * from "./map-html.js";
|
|
18
|
+
export * from "./models/dedup-gbt-en-us.js";
|
|
19
|
+
export * from "./reconcile.js";
|
|
15
20
|
export * from "./resolve.js";
|
|
16
21
|
export * from "./types.js";
|
|
17
22
|
//# sourceMappingURL=index.d.ts.map
|
package/out/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,kBAAkB,CAAA;AAChC,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,qBAAqB,CAAA;AACnC,cAAc,eAAe,CAAA;AAC7B,cAAc,6BAA6B,CAAA;AAC3C,cAAc,gBAAgB,CAAA;AAC9B,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
|
package/out/index.js
CHANGED
|
@@ -10,8 +10,13 @@
|
|
|
10
10
|
* {@link toGeoJSON} exports them for QGIS. This is the clinic-funding use case mailwoman was built
|
|
11
11
|
* for, finally standing on a calibrated, label-free matcher.
|
|
12
12
|
*/
|
|
13
|
+
export * from "./address-key.js";
|
|
13
14
|
export * from "./geojson.js";
|
|
14
15
|
export * from "./ingest.js";
|
|
16
|
+
export * from "./learned-scorer.js";
|
|
17
|
+
export * from "./map-html.js";
|
|
18
|
+
export * from "./models/dedup-gbt-en-us.js";
|
|
19
|
+
export * from "./reconcile.js";
|
|
15
20
|
export * from "./resolve.js";
|
|
16
21
|
export * from "./types.js";
|
|
17
22
|
//# sourceMappingURL=index.js.map
|
package/out/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,kBAAkB,CAAA;AAChC,cAAc,cAAc,CAAA;AAC5B,cAAc,aAAa,CAAA;AAC3B,cAAc,qBAAqB,CAAA;AACnC,cAAc,eAAe,CAAA;AAC7B,cAAc,6BAA6B,CAAA;AAC3C,cAAc,gBAAgB,CAAA;AAC9B,cAAc,cAAc,CAAA;AAC5B,cAAc,YAAY,CAAA"}
|
package/out/ingest.d.ts
CHANGED
|
@@ -26,8 +26,36 @@ import { toPostalAddress } from "@mailwoman/record";
|
|
|
26
26
|
import type { SourceRecord } from "./types.js";
|
|
27
27
|
/** Resolve a raw address string into a {@link PostalAddress}. The seam to mailwoman's geocoder. */
|
|
28
28
|
export type GeocodeAddress = (raw: string) => Promise<PostalAddress | null> | PostalAddress | null;
|
|
29
|
-
/**
|
|
30
|
-
|
|
29
|
+
/** Column delimiter of a delimited source. */
|
|
30
|
+
export type Delimiter = "comma" | "tab";
|
|
31
|
+
/** Infer the delimiter from a path's extension (`.tsv` → tab, else comma). */
|
|
32
|
+
export declare function delimiterFor(path: string): Delimiter;
|
|
33
|
+
/**
|
|
34
|
+
* Stream a delimited file's rows lazily as header-keyed objects — the same shape {@link parseCsv}
|
|
35
|
+
* returns, but **without loading the file into memory**. A multi-GB source (the NPPES registry is
|
|
36
|
+
* ~4.8 GB / 9.6M rows — too big for `readFileSync`, which throws `ERR_STRING_TOO_LONG`) streams
|
|
37
|
+
* line by line. Keys are the original header names so a {@link ColumnMapping} written against the
|
|
38
|
+
* source's headers matches. Filter/sample the stream before {@link ingestRows} to keep only the rows
|
|
39
|
+
* you geocode.
|
|
40
|
+
*
|
|
41
|
+
* We stream _lines_ with spliterator's `TextSpliterator` (pure-Node, the part that handles the huge
|
|
42
|
+
* file) and split each line into columns here with `String.prototype.split`. We deliberately do NOT
|
|
43
|
+
* use `CSVSpliterator`: its column tokenizer hard-codes `skipEmpty` (it builds the column
|
|
44
|
+
* spliterator as `{ delimiter }` with no `skipEmpty: false`), so consecutive delimiters collapse
|
|
45
|
+
* and EMPTY FIELDS ARE DROPPED — fatal for a fixed-width registry like NPPES where a row of 330
|
|
46
|
+
* columns full of empties would mis-parse to 40 and shift every value. (Upstream `spliterator` bug;
|
|
47
|
+
* revisit when it's fixed.)
|
|
48
|
+
*
|
|
49
|
+
* Assumes an unquoted delimited file (no fields containing the delimiter) — true for these
|
|
50
|
+
* government TSVs. For small, possibly-quoted CSVs use {@link parseCsv} (quote-aware, in-memory).
|
|
51
|
+
*/
|
|
52
|
+
export declare function streamRows(source: string, opts?: {
|
|
53
|
+
delimiter?: Delimiter;
|
|
54
|
+
}): AsyncGenerator<Record<string, string>>;
|
|
55
|
+
/**
|
|
56
|
+
* Maps dataset columns to record fields. A field may draw from several columns (joined with
|
|
57
|
+
* spaces).
|
|
58
|
+
*/
|
|
31
59
|
export interface ColumnMapping {
|
|
32
60
|
/** Column holding a stable row id. Falls back to the row index. */
|
|
33
61
|
id?: string;
|
|
@@ -38,16 +66,49 @@ export interface ColumnMapping {
|
|
|
38
66
|
address?: string | string[];
|
|
39
67
|
phone?: string;
|
|
40
68
|
email?: string;
|
|
69
|
+
/**
|
|
70
|
+
* Extra secondary-identifier fields → the column(s) to draw each from (joined with spaces). Land
|
|
71
|
+
* on `SourceRecord.attributes` under the same key, for the matcher's `discriminators`
|
|
72
|
+
* (authorized-official name, taxonomy, license…).
|
|
73
|
+
*/
|
|
74
|
+
attributes?: Record<string, string | string[]>;
|
|
41
75
|
}
|
|
76
|
+
/**
|
|
77
|
+
* Best-effort {@link ColumnMapping} inferred from a header row — the "point it at any CSV"
|
|
78
|
+
* convenience. Each column name is matched (case- and punctuation-insensitive, on whole tokens) to
|
|
79
|
+
* a field by keyword, in a precedence that resolves the common ambiguities: a dedicated id / phone
|
|
80
|
+
* / email column is claimed before the generic sweep, an org / facility column beats a person
|
|
81
|
+
* "name", and address columns (street / city / state / zip…) collect into one multi-column field.
|
|
82
|
+
* Imperfect on bespoke headers (an explicit mapping or the LLM-assisted inference #603 is the
|
|
83
|
+
* answer there), but it nails tidy and semi-tidy files with no hand-mapping. Unmatched columns are
|
|
84
|
+
* left out.
|
|
85
|
+
*/
|
|
86
|
+
export declare function inferMapping(header: readonly string[]): ColumnMapping;
|
|
42
87
|
/** Options for {@link ingestRows}. */
|
|
43
88
|
export interface IngestOptions {
|
|
44
89
|
/** The geocoding seam. Without it, records carry name/org but no resolved address. */
|
|
45
90
|
geocodeAddress?: GeocodeAddress;
|
|
91
|
+
/**
|
|
92
|
+
* Separator for joining a multi-column ADDRESS mapping (name/org always join with a space).
|
|
93
|
+
* `", "` gives the parser delimited input (`"214 Main St, Austin, TX
|
|
94
|
+
* 78701"`) instead of a concatenated run (`"214 Main St Austin TX 78701"`) — the latter strips
|
|
95
|
+
* the parser's segmentation boundaries and is partly OOD (it also breaks all-caps
|
|
96
|
+
* case-normalization; #694). **Default `", "` (#694 flip, validated).** Comma-join is the correct
|
|
97
|
+
* shape for an address built from separate columns, and #700 measured it at +15% cross-dataset
|
|
98
|
+
* rooftop (579→667) with no comma-less crater. The dedup GBT was trained on the old space-joined
|
|
99
|
+
* coords, so this flip is paired with a GBT re-validation (#694). Pass `" "` to restore the
|
|
100
|
+
* legacy space-join for a byte-stable A/B.
|
|
101
|
+
*/
|
|
102
|
+
addressSeparator?: string;
|
|
46
103
|
}
|
|
47
104
|
/** Parse a CSV string (with a header row) into row objects keyed by column name. */
|
|
48
105
|
export declare function parseCsv(text: string): Record<string, string>[];
|
|
49
|
-
/**
|
|
50
|
-
|
|
106
|
+
/**
|
|
107
|
+
* Normalize tabular rows into {@link SourceRecord}s under a {@link ColumnMapping}. Accepts a sync OR
|
|
108
|
+
* async iterable, so {@link parseCsv} (in-memory) and {@link streamRows} (lazy, for huge files) both
|
|
109
|
+
* thread straight through.
|
|
110
|
+
*/
|
|
111
|
+
export declare function ingestRows(rows: Iterable<Record<string, string>> | AsyncIterable<Record<string, string>>, mapping: ColumnMapping, opts?: IngestOptions): Promise<SourceRecord[]>;
|
|
51
112
|
/**
|
|
52
113
|
* The subset of mailwoman's `GeocodeResult` the adapter consumes — kept structural so this package
|
|
53
114
|
* never imports the heavy geocoder, yet a real `GeocodeResult` maps straight in.
|
package/out/ingest.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,aAAa,EAAE,MAAM,mBAAmB,CAAA;AACtE,OAAO,EAAiD,eAAe,EAAe,MAAM,mBAAmB,CAAA;
|
|
1
|
+
{"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,aAAa,EAAE,MAAM,mBAAmB,CAAA;AACtE,OAAO,EAAiD,eAAe,EAAe,MAAM,mBAAmB,CAAA;AAI/G,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C,mGAAmG;AACnG,MAAM,MAAM,cAAc,GAAG,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,GAAG,aAAa,GAAG,IAAI,CAAA;AAElG,8CAA8C;AAC9C,MAAM,MAAM,SAAS,GAAG,OAAO,GAAG,KAAK,CAAA;AAEvC,8EAA8E;AAC9E,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS,CAEpD;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAuB,UAAU,CAChC,MAAM,EAAE,MAAM,EACd,IAAI,GAAE;IAAE,SAAS,CAAC,EAAE,SAAS,CAAA;CAAO,GAClC,cAAc,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CA0BxC;AAED;;;GAGG;AACH,MAAM,WAAW,aAAa;IAC7B,mEAAmE;IACnE,EAAE,CAAC,EAAE,MAAM,CAAA;IACX,+DAA+D;IAC/D,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,IAAI,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IACxB,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IAChC,OAAO,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAAA;IAC3B,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd;;;;OAIG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC,CAAA;CAC9C;AAED;;;;;;;;;GASG;AACH,wBAAgB,YAAY,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,GAAG,aAAa,CA4CrE;AAED,sCAAsC;AACtC,MAAM,WAAW,aAAa;IAC7B,sFAAsF;IACtF,cAAc,CAAC,EAAE,cAAc,CAAA;IAC/B;;;;;;;;;;OAUG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAA;CACzB;AAED,oFAAoF;AACpF,wBAAgB,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAE/D;AAcD;;;;GAIG;AACH,wBAAsB,UAAU,CAC/B,IAAI,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,GAAG,aAAa,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,EAC9E,OAAO,EAAE,aAAa,EACtB,IAAI,GAAE,aAAkB,GACtB,OAAO,CAAC,YAAY,EAAE,CAAC,CAoCzB;AAED;;;GAGG;AACH,MAAM,WAAW,UAAU;IAC1B,GAAG,EAAE,MAAM,GAAG,IAAI,CAAA;IAClB,GAAG,EAAE,MAAM,GAAG,IAAI,CAAA;IAClB,eAAe,EAAE,cAAc,CAAC,MAAM,CAAC,CAAA;IACvC,aAAa,EAAE,MAAM,GAAG,IAAI,CAAA;IAC5B,SAAS,CAAC,EAAE,cAAc,CAAC,WAAW,CAAC,CAAA;CACvC;AAED;;;;;;GAMG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE;IACvC,KAAK,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,UAAU,CAAC,OAAO,eAAe,CAAC,CAA
C,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,OAAO,eAAe,CAAC,CAAC,CAAC,CAAC,CAAA;IAC9G,OAAO,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,GAAG,UAAU,GAAG,IAAI,CAAA;IACxE,OAAO,CAAC,EAAE,MAAM,CAAA;CAChB,GAAG,cAAc,CAgBjB"}
|
package/out/ingest.js
CHANGED
|
@@ -23,31 +23,139 @@
|
|
|
23
23
|
*/
|
|
24
24
|
import { canonicalizeOrganizationName, parsePersonName, toPostalAddress, withGeocode } from "@mailwoman/record";
|
|
25
25
|
import { parse as parseCsvSync } from "csv-parse/sync";
|
|
26
|
+
import { open } from "node:fs/promises";
|
|
27
|
+
import { Delimiters, TextSpliterator } from "spliterator";
|
|
28
|
+
/** Infer the delimiter from a path's extension (`.tsv` → tab, else comma). */
|
|
29
|
+
export function delimiterFor(path) {
|
|
30
|
+
return /\.tsv$/i.test(path) ? "tab" : "comma";
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Stream a delimited file's rows lazily as header-keyed objects — the same shape {@link parseCsv}
|
|
34
|
+
* returns, but **without loading the file into memory**. A multi-GB source (the NPPES registry is
|
|
35
|
+
* ~4.8 GB / 9.6M rows — too big for `readFileSync`, which throws `ERR_STRING_TOO_LONG`) streams
|
|
36
|
+
* line by line. Keys are the original header names so a {@link ColumnMapping} written against the
|
|
37
|
+
* source's headers matches. Filter/sample the stream before {@link ingestRows} to keep only the rows
|
|
38
|
+
* you geocode.
|
|
39
|
+
*
|
|
40
|
+
* We stream _lines_ with spliterator's `TextSpliterator` (pure-Node, the part that handles the huge
|
|
41
|
+
* file) and split each line into columns here with `String.prototype.split`. We deliberately do NOT
|
|
42
|
+
* use `CSVSpliterator`: its column tokenizer hard-codes `skipEmpty` (it builds the column
|
|
43
|
+
* spliterator as `{ delimiter }` with no `skipEmpty: false`), so consecutive delimiters collapse
|
|
44
|
+
* and EMPTY FIELDS ARE DROPPED — fatal for a fixed-width registry like NPPES where a row of 330
|
|
45
|
+
* columns full of empties would mis-parse to 40 and shift every value. (Upstream `spliterator` bug;
|
|
46
|
+
* revisit when it's fixed.)
|
|
47
|
+
*
|
|
48
|
+
* Assumes an unquoted delimited file (no fields containing the delimiter) — true for these
|
|
49
|
+
* government TSVs. For small, possibly-quoted CSVs use {@link parseCsv} (quote-aware, in-memory).
|
|
50
|
+
*/
|
|
51
|
+
export async function* streamRows(source, opts = {}) {
|
|
52
|
+
const sep = (opts.delimiter ?? delimiterFor(source)) === "tab" ? "\t" : ",";
|
|
53
|
+
// Own the file handle so it's closed deterministically. spliterator's `autoDispose` only fires on
|
|
54
|
+
// natural completion, not on an early `break`/`.return()` — which then leaks the fd (a GC-time error
|
|
55
|
+
// in Node 24+). We open it, pass `autoDispose: false` so spliterator never touches our handle, and
|
|
56
|
+
// close it in `finally` (runs on completion AND when the consumer abandons the generator early).
|
|
57
|
+
const handle = await open(source, "r");
|
|
58
|
+
try {
|
|
59
|
+
let header = null;
|
|
60
|
+
for await (const line of TextSpliterator.fromAsync(handle, {
|
|
61
|
+
delimiter: Delimiters.LineFeed,
|
|
62
|
+
autoDispose: false,
|
|
63
|
+
})) {
|
|
64
|
+
if (line.length === 0)
|
|
65
|
+
continue; // blank line / trailing newline
|
|
66
|
+
const fields = line.replace(/\r$/, "").split(sep); // tolerate CRLF
|
|
67
|
+
if (header === null) {
|
|
68
|
+
header = fields;
|
|
69
|
+
continue;
|
|
70
|
+
}
|
|
71
|
+
const row = {};
|
|
72
|
+
for (let i = 0; i < header.length; i++)
|
|
73
|
+
row[header[i]] = fields[i] ?? "";
|
|
74
|
+
yield row;
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
finally {
|
|
78
|
+
await handle.close();
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
/**
 * Best-effort {@link ColumnMapping} inferred from a header row — the "point it at any CSV"
 * convenience.
 *
 * Each column name is lower-cased, every run of non-alphanumeric characters becomes one space, and
 * the result is padded with a space on both sides, so keywords only ever match whole tokens
 * ("state" does not match inside "statement", and "e-mail" is seen as "e mail"). A column is then
 * classified by the FIRST keyword group it matches, checked in this order: email, phone, id,
 * organization, address, name. That precedence resolves the common ambiguities: a dedicated id /
 * phone / email column is claimed before the generic sweep, an org / facility column beats a
 * person "name", and address columns (street / city / state / zip…) collect into one
 * multi-column field.
 *
 * The single-valued fields (email, phone, id, organization) keep the first matching column. A
 * later column of the same category is left out of the mapping; it is never reclassified under a
 * lower-precedence field (a second "Email Address" column must not end up in the address list).
 *
 * Imperfect on bespoke headers (an explicit mapping or the LLM-assisted inference #603 is the
 * answer there), but it handles tidy and semi-tidy files with no hand-mapping. Unmatched columns
 * are left out.
 *
 * @param header Column names of the header row, in file order.
 * @returns The inferred mapping. `name` is a single column name when exactly one column matched
 *   and an array otherwise; `address` is always an array; fields with no match are absent.
 */
export function inferMapping(header) {
    // Pad to whole-token boundaries so "state" doesn't match inside "statement".
    const tok = (h) => ` ${h
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, " ")
        .trim()} `;
    const mapping = {};
    const name = [];
    const address = [];
    for (const column of header) {
        const h = tok(column);
        const has = (...words) => words.some((w) => h.includes(` ${w} `));
        // Classify by keyword first, THEN claim the slot only if it is still free. Guarding the
        // keyword test with "slot is free" would let a duplicate column of an already-claimed
        // category fall through to the organization / address / name branches below.
        if (has("email", "e mail"))
            mapping.email ??= column;
        else if (has("phone", "telephone", "tel", "mobile", "cell"))
            mapping.phone ??= column;
        else if (has("id", "npi", "ein", "frn", "spin", "uuid", "guid", "key"))
            mapping.id ??= column;
        else if (has("org", "organization", "organisation", "company", "business", "facility", "agency", "employer"))
            mapping.organization ??= column;
        else if (has("street", "address", "addr", "city", "town", "state", "province", "zip", "zipcode", "postal", "postcode", "county"))
            address.push(column);
        else if (has("name", "first", "last", "given", "family", "middle", "surname", "fullname", "contact"))
            name.push(column);
    }
    // A lone name column is stored as a plain string; several are kept as a list in header order.
    if (name.length)
        mapping.name = name.length === 1 ? name[0] : name;
    if (address.length)
        mapping.address = address;
    return mapping;
}
|
|
26
122
|
/**
 * Parse CSV text whose first line is a header row into an array of row objects keyed by column
 * name. Delegates to `parseCsvSync` (imported elsewhere in this file — presumably csv-parse's
 * synchronous parser; confirm against the import) with header-keyed rows, empty lines skipped,
 * values trimmed, and ragged rows tolerated.
 */
export function parseCsv(text) {
    const options = {
        columns: true,
        skip_empty_lines: true,
        trim: true,
        relax_column_count: true,
    };
    return parseCsvSync(text, options);
}
|
|
30
126
|
/**
 * Collect the value(s) of one or more named columns from a row and join them into one string.
 *
 * Each cell is trimmed, empty or missing cells are dropped, the survivors are joined with
 * `separator` (a single space by default), and the joined text is trimmed once more.
 *
 * @param row Row object keyed by column name.
 * @param columns A column name, a list of column names, or a falsy value for "no mapping".
 * @param separator Text placed between the non-empty cell values.
 * @returns The joined string, or `undefined` when no columns were given or nothing non-empty
 *   was found.
 */
function pick(row, columns, separator = " ") {
    if (!columns) {
        return undefined;
    }
    const names = Array.isArray(columns) ? columns : [columns];
    const parts = [];
    for (const column of names) {
        const cell = row[column]?.trim();
        if (cell) {
            parts.push(cell);
        }
    }
    const joined = parts.join(separator).trim();
    return joined === "" ? undefined : joined;
}
|
|
42
|
-
/**
|
|
138
|
+
/**
|
|
139
|
+
* Normalize tabular rows into {@link SourceRecord}s under a {@link ColumnMapping}. Accepts a sync OR
|
|
140
|
+
* async iterable, so {@link parseCsv} (in-memory) and {@link streamRows} (lazy, for huge files) both
|
|
141
|
+
* thread straight through.
|
|
142
|
+
*/
|
|
43
143
|
export async function ingestRows(rows, mapping, opts = {}) {
|
|
44
144
|
const records = [];
|
|
45
145
|
let index = 0;
|
|
46
|
-
for (const row of rows) {
|
|
146
|
+
for await (const row of rows) {
|
|
47
147
|
const id = (mapping.id ? row[mapping.id]?.trim() : "") || String(index);
|
|
48
148
|
const nameValue = pick(row, mapping.name);
|
|
49
149
|
const orgValue = pick(row, mapping.organization);
|
|
50
|
-
const addressValue = pick(row, mapping.address);
|
|
150
|
+
const addressValue = pick(row, mapping.address, opts.addressSeparator ?? ", ");
|
|
151
|
+
let attributes;
|
|
152
|
+
if (mapping.attributes) {
|
|
153
|
+
for (const [key, columns] of Object.entries(mapping.attributes)) {
|
|
154
|
+
const value = pick(row, columns);
|
|
155
|
+
if (value)
|
|
156
|
+
(attributes ??= {})[key] = value;
|
|
157
|
+
}
|
|
158
|
+
}
|
|
51
159
|
const record = {
|
|
52
160
|
id,
|
|
53
161
|
source: mapping.source,
|
|
@@ -56,6 +164,7 @@ export async function ingestRows(rows, mapping, opts = {}) {
|
|
|
56
164
|
phone: (mapping.phone && row[mapping.phone]?.trim()) || undefined,
|
|
57
165
|
email: (mapping.email && row[mapping.email]?.trim()?.toLowerCase()) || undefined,
|
|
58
166
|
address: addressValue && opts.geocodeAddress ? ((await opts.geocodeAddress(addressValue)) ?? undefined) : undefined,
|
|
167
|
+
attributes,
|
|
59
168
|
raw: row,
|
|
60
169
|
};
|
|
61
170
|
records.push(record);
|
package/out/ingest.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ingest.js","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAGH,OAAO,EAAE,4BAA4B,EAAE,eAAe,EAAE,eAAe,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAA;AAC/G,OAAO,EAAE,KAAK,IAAI,YAAY,EAAE,MAAM,gBAAgB,CAAA;
|
|
1
|
+
{"version":3,"file":"ingest.js","sourceRoot":"","sources":["../ingest.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAGH,OAAO,EAAE,4BAA4B,EAAE,eAAe,EAAE,eAAe,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAA;AAC/G,OAAO,EAAE,KAAK,IAAI,YAAY,EAAE,MAAM,gBAAgB,CAAA;AACtD,OAAO,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAA;AACvC,OAAO,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,aAAa,CAAA;AASzD,8EAA8E;AAC9E,MAAM,UAAU,YAAY,CAAC,IAAY;IACxC,OAAO,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,CAAA;AAC9C,CAAC;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,MAAM,CAAC,KAAK,SAAS,CAAC,CAAC,UAAU,CAChC,MAAc,EACd,OAAkC,EAAE;IAEpC,MAAM,GAAG,GAAG,CAAC,IAAI,CAAC,SAAS,IAAI,YAAY,CAAC,MAAM,CAAC,CAAC,KAAK,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAA;IAC3E,kGAAkG;IAClG,qGAAqG;IACrG,mGAAmG;IACnG,iGAAiG;IACjG,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;IACtC,IAAI,CAAC;QACJ,IAAI,MAAM,GAAoB,IAAI,CAAA;QAClC,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,eAAe,CAAC,SAAS,CAAC,MAAM,EAAE;YAC1D,SAAS,EAAE,UAAU,CAAC,QAAQ;YAC9B,WAAW,EAAE,KAAK;SAClB,CAAC,EAAE,CAAC;YACJ,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;gBAAE,SAAQ,CAAC,gCAAgC;YAChE,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA,CAAC,gBAAgB;YAClE,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;gBACrB,MAAM,GAAG,MAAM,CAAA;gBACf,SAAQ;YACT,CAAC;YACD,MAAM,GAAG,GAA2B,EAAE,CAAA;YACtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE;gBAAE,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;YACzE,MAAM,GAAG,CAAA;QACV,CAAC;IACF,CAAC;YAAS,CAAC;QACV,MAAM,MAAM,CAAC,KAAK,EAAE,CAAA;IACrB,CAAC;AACF,CAAC;AAwBD;;;;;;;;;GASG;AACH,MAAM,UAAU,YAAY,CAAC,MAAyB;IACrD,6EAA6E;IAC7E,MAAM,GAAG,GAAG,CAAC,CAAS,EAAE,EAAE,CACzB,IAAI,CAAC;SACH,WAAW,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC;SAC3B,IAAI,EAAE,GAAG,CAAA;IACZ,MAAM,OAAO,GAAkB,EAAE,CAAA;IACjC,MAAM,IAAI,GAAa,EAAE,CAAA;IACzB,MAAM,OAAO,GAAa,EAAE,CAAA;IAE5B,KAAK,MAAM,MAAM,IAAI,MAAM,EAAE,CAAC;QAC7B,MAAM,CAAC,GAAG,GAAG,CAAC,MAAM,CAAC,CAAA;QACrB,MAAM,GAAG,GAAG,CAAC,GAAG,KAAe,EAAW,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,C
AAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QAEpF,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,EAAE,QAAQ,CAAC;YAAE,OAAO,CAAC,KAAK,GAAG,MAAM,CAAA;aAC/D,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,EAAE,WAAW,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,CAAC;YAAE,OAAO,CAAC,KAAK,GAAG,MAAM,CAAA;aAChG,IAAI,CAAC,OAAO,CAAC,EAAE,IAAI,GAAG,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,CAAC;YAAE,OAAO,CAAC,EAAE,GAAG,MAAM,CAAA;aACrG,IAAI,GAAG,CAAC,KAAK,EAAE,cAAc,EAAE,cAAc,EAAE,SAAS,EAAE,UAAU,EAAE,UAAU,EAAE,QAAQ,EAAE,UAAU,CAAC;YAC3G,OAAO,CAAC,YAAY,KAAK,MAAM,CAAA;aAC3B,IACJ,GAAG,CACF,QAAQ,EACR,SAAS,EACT,MAAM,EACN,MAAM,EACN,MAAM,EACN,OAAO,EACP,UAAU,EACV,KAAK,EACL,SAAS,EACT,QAAQ,EACR,UAAU,EACV,QAAQ,CACR;YAED,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;aAChB,IAAI,GAAG,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,SAAS,EAAE,UAAU,EAAE,SAAS,CAAC;YACnG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACnB,CAAC;IAED,IAAI,IAAI,CAAC,MAAM;QAAE,OAAO,CAAC,IAAI,GAAG,IAAI,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,CAAC,CAAC,IAAI,CAAA;IACnE,IAAI,OAAO,CAAC,MAAM;QAAE,OAAO,CAAC,OAAO,GAAG,OAAO,CAAA;IAC7C,OAAO,OAAO,CAAA;AACf,CAAC;AAoBD,oFAAoF;AACpF,MAAM,UAAU,QAAQ,CAAC,IAAY;IACpC,OAAO,YAAY,CAAC,IAAI,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,gBAAgB,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,kBAAkB,EAAE,IAAI,EAAE,CAAC,CAAA;AAC3G,CAAC;AAED,6FAA6F;AAC7F,SAAS,IAAI,CAAC,GAA2B,EAAE,OAA2B,EAAE,SAAS,GAAG,GAAG;IACtF,IAAI,CAAC,OAAO;QAAE,OAAO,SAAS,CAAA;IAC9B,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAA;IACzD,MAAM,KAAK,GAAG,IAAI;SAChB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;SAC1B,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,SAAS,CAAC;SACf,IAAI,EAAE,CAAA;IACR,OAAO,KAAK,IAAI,SAAS,CAAA;AAC1B,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAC/B,IAA8E,EAC9E,OAAsB,EACtB,OAAsB,EAAE;IAExB,MAAM,OAAO,GAAmB,EAAE,CAAA;IAClC,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,IAAI,KAAK,EAAE,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QAC9B,M
AAM,EAAE,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,MAAM,CAAC,KAAK,CAAC,CAAA;QACvE,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,CAAC,CAAA;QACzC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,YAAY,CAAC,CAAA;QAChD,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,gBAAgB,IAAI,IAAI,CAAC,CAAA;QAE9E,IAAI,UAA8C,CAAA;QAClD,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;YACxB,KAAK,MAAM,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,CAAC;gBACjE,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,CAAA;gBAChC,IAAI,KAAK;oBAAE,CAAC,UAAU,KAAK,EAAE,CAAC,CAAC,GAAG,CAAC,GAAG,KAAK,CAAA;YAC5C,CAAC;QACF,CAAC;QAED,MAAM,MAAM,GAAiB;YAC5B,EAAE;YACF,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;YACxD,YAAY,EAAE,QAAQ,CAAC,CAAC,CAAC,4BAA4B,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS;YAC3E,KAAK,EAAE,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,CAAC,IAAI,SAAS;YACjE,KAAK,EAAE,CAAC,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,EAAE,WAAW,EAAE,CAAC,IAAI,SAAS;YAChF,OAAO,EACN,YAAY,IAAI,IAAI,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,cAAc,CAAC,YAAY,CAAC,CAAC,IAAI,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;YAC3G,UAAU;YACV,GAAG,EAAE,GAAG;SACR,CAAA;QAED,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACpB,KAAK,EAAE,CAAA;IACR,CAAC;IAED,OAAO,OAAO,CAAA;AACf,CAAC;AAcD;;;;;;GAMG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAIjC;IACA,OAAO,KAAK,EAAE,GAAW,EAAiC,EAAE;QAC3D,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;QACxC,MAAM,IAAI,GAAG,eAAe,CAAC,UAAU,EAAE,EAAE,OAAO,EAAE,IAAI,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,CAAA;QAExE,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAA;QACxC,IAAI,CAAC,QAAQ,IAAI,QAAQ,CAAC,GAAG,KAAK,IAAI,IAAI,QAAQ,CAAC,GAAG,KAAK,IAAI;YAAE,OAAO,IAAI,CAAA;QAE5E,MAAM,OAAO,GAAmB;YAC/B,UAAU,EAAE,EAAE,QAAQ,EAAE,QAAQ,CAAC,GAAG,EAAE,SAAS,EAAE,QAAQ,CAAC,GAAG,EAAE;YAC/D,IAAI,EAAE,QAAQ,CAAC,eAAe;YAC9B,iBAAiB,EAAE,QAAQ,CAAC,aAAa;YACzC,SAAS,EAAE,QAAQ,CAAC
,SAAS;SAC7B,CAAA;QACD,OAAO,WAAW,CAAC,IAAI,EAAE,OAAO,CAAC,CAAA;IAClC,CAAC,CAAA;AACF,CAAC"}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* The learned scorer (#603) — the production wiring for the gradient-boosted-tree model behind
|
|
7
|
+
* {@link ResolveConfig.scorer}. Two pieces:
|
|
8
|
+
*
|
|
9
|
+
* 1. {@link createMatchFeaturizer} — the ONE feature extractor for a candidate pair, used identically
|
|
10
|
+
* at train time (`scripts/record-matcher/train-gbt.ts`), eval time (the learned-scorer
|
|
11
|
+
* evals), and inference time (here). A pair → one-hot of each comparison's agreement level +
|
|
12
|
+
* the over-merge interaction terms (co-located × name/org disagreement) + address
|
|
13
|
+
* crowdedness.
|
|
14
|
+
* 2. {@link createGbtScorer} — wraps a trained {@link GBT} + the featurizer into the `(a, b) => number`
|
|
15
|
+
* the resolve pipeline's `scorer` hook expects (a logit, threshold-comparable with the
|
|
16
|
+
* Fellegi-Sunter weight it replaces).
|
|
17
|
+
*
|
|
18
|
+
* Both take the comparison set as INPUT (rather than importing {@link buildDefaultModel}) so this
|
|
19
|
+
* module has no dependency cycle with `resolve.ts`. The contract that keeps train ≡ inference:
|
|
20
|
+
* feed the comparisons from `buildDefaultModel({ collapseSpatial: true, addressFrequency })` —
|
|
21
|
+
* the model's structure (and thus the feature layout) is fixed by that config; only the frequency
|
|
22
|
+
* VALUES differ between the training corpus and the matched set, which is the point (the model
|
|
23
|
+
* generalizes, as the cross-state eval showed).
|
|
24
|
+
*/
|
|
25
|
+
import { type Comparison, type GBT, type TermFrequencyTable } from "@mailwoman/match";
|
|
26
|
+
import type { SourceRecord } from "./types.js";
|
|
27
|
+
/** Configuration consumed by both the pair featurizer and the scorer factory. */
export interface LearnedFeatureConfig {
    /**
     * Comparisons the feature vector is built from. To keep the feature layout aligned with the
     * trained model this MUST be the `comparisons` of
     * `buildDefaultModel({ collapseSpatial: true, addressFrequency })`.
     * (`usePhone` / `discriminators` play no part in the learned feature model — the GBT replaces
     * the FS weight wholesale and owns its own feature vector.)
     */
    comparisons: Array<Comparison<SourceRecord>>;
    /**
     * Frequency table of addresses, backing the crowdedness feature (an address shared by many
     * records is weak evidence of identity).
     */
    addressFrequency: TermFrequencyTable;
}
|
|
42
|
+
/**
 * Create the feature extractor applied to a candidate pair of records.
 *
 * Layout of the returned vector, in order:
 *
 * 1. A one-hot encoding of each comparison's agreement level.
 * 2. Two over-merge interaction terms — spatial-exact × name-disagree and spatial-exact ×
 *    org-disagree — i.e. the "same place, different names" signature behind co-located
 *    over-merges.
 * 3. Address crowdedness, scaled into [0, 1].
 *
 * The extraction is deterministic and does not depend on EM, so train, eval and inference all see
 * the same features.
 */
export declare function createMatchFeaturizer(
    config: LearnedFeatureConfig,
): (a: SourceRecord, b: SourceRecord) => Array<number>;
|
|
50
|
+
/**
 * Turn a trained {@link GBT} into the `(a, b) => number` link scorer that
 * {@link ResolveConfig.scorer} accepts. The number returned is the model's logit, which is in the
 * same threshold-comparable units as the Fellegi-Sunter weight it stands in for — the pipeline's
 * clustering and threshold semantics therefore stay as they were.
 */
export declare function createGbtScorer(
    config: LearnedFeatureConfig & { model: GBT },
): (a: SourceRecord, b: SourceRecord) => number;
|
|
59
|
+
//# sourceMappingURL=learned-scorer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"learned-scorer.d.ts","sourceRoot":"","sources":["../learned-scorer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,EAAoB,KAAK,UAAU,EAAE,KAAK,GAAG,EAAY,KAAK,kBAAkB,EAAE,MAAM,kBAAkB,CAAA;AACjH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C,4DAA4D;AAC5D,MAAM,WAAW,oBAAoB;IACpC;;;;;OAKG;IACH,WAAW,EAAE,UAAU,CAAC,YAAY,CAAC,EAAE,CAAA;IACvC;;;OAGG;IACH,gBAAgB,EAAE,kBAAkB,CAAA;CACpC;AAED;;;;;;GAMG;AACH,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,oBAAoB,GAAG,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,KAAK,MAAM,EAAE,CAkClH;AAED;;;;;GAKG;AACH,wBAAgB,eAAe,CAC9B,MAAM,EAAE,oBAAoB,GAAG;IAAE,KAAK,EAAE,GAAG,CAAA;CAAE,GAC3C,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,KAAK,MAAM,CAI9C"}
|