@mailwoman/corpus 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/src/adapter.d.ts +96 -0
- package/out/src/adapter.d.ts.map +1 -0
- package/out/src/adapter.js +107 -0
- package/out/src/adapter.js.map +1 -0
- package/out/src/adapters/ban/adapter.d.ts +32 -0
- package/out/src/adapters/ban/adapter.d.ts.map +1 -0
- package/out/src/adapters/ban/adapter.js +133 -0
- package/out/src/adapters/ban/adapter.js.map +1 -0
- package/out/src/adapters/fcc-bdc/adapter.d.ts +61 -0
- package/out/src/adapters/fcc-bdc/adapter.d.ts.map +1 -0
- package/out/src/adapters/fcc-bdc/adapter.js +153 -0
- package/out/src/adapters/fcc-bdc/adapter.js.map +1 -0
- package/out/src/adapters/index.d.ts +42 -0
- package/out/src/adapters/index.d.ts.map +1 -0
- package/out/src/adapters/index.js +76 -0
- package/out/src/adapters/index.js.map +1 -0
- package/out/src/adapters/openaddresses/adapter.d.ts +60 -0
- package/out/src/adapters/openaddresses/adapter.d.ts.map +1 -0
- package/out/src/adapters/openaddresses/adapter.js +174 -0
- package/out/src/adapters/openaddresses/adapter.js.map +1 -0
- package/out/src/adapters/state-ia-contractors/adapter.d.ts +23 -0
- package/out/src/adapters/state-ia-contractors/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-ia-contractors/adapter.js +113 -0
- package/out/src/adapters/state-ia-contractors/adapter.js.map +1 -0
- package/out/src/adapters/state-ny-notaries/adapter.d.ts +21 -0
- package/out/src/adapters/state-ny-notaries/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-ny-notaries/adapter.js +132 -0
- package/out/src/adapters/state-ny-notaries/adapter.js.map +1 -0
- package/out/src/adapters/state-tx-notaries/adapter.d.ts +22 -0
- package/out/src/adapters/state-tx-notaries/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-tx-notaries/adapter.js +125 -0
- package/out/src/adapters/state-tx-notaries/adapter.js.map +1 -0
- package/out/src/adapters/tiger/adapter.d.ts +45 -0
- package/out/src/adapters/tiger/adapter.d.ts.map +1 -0
- package/out/src/adapters/tiger/adapter.js +179 -0
- package/out/src/adapters/tiger/adapter.js.map +1 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts +36 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.js +147 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.js.map +1 -0
- package/out/src/adapters/usgov-imls-pls/adapter.d.ts +25 -0
- package/out/src/adapters/usgov-imls-pls/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-imls-pls/adapter.js +118 -0
- package/out/src/adapters/usgov-imls-pls/adapter.js.map +1 -0
- package/out/src/adapters/usgov-nad/adapter.d.ts +37 -0
- package/out/src/adapters/usgov-nad/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-nad/adapter.js +227 -0
- package/out/src/adapters/usgov-nad/adapter.js.map +1 -0
- package/out/src/adapters/usgov-nppes/adapter.d.ts +28 -0
- package/out/src/adapters/usgov-nppes/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-nppes/adapter.js +123 -0
- package/out/src/adapters/usgov-nppes/adapter.js.map +1 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts +35 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js +162 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js.map +1 -0
- package/out/src/adapters/wof-admin-json/adapter.d.ts +85 -0
- package/out/src/adapters/wof-admin-json/adapter.d.ts.map +1 -0
- package/out/src/adapters/wof-admin-json/adapter.js +241 -0
- package/out/src/adapters/wof-admin-json/adapter.js.map +1 -0
- package/out/src/adapters/wof-postalcode-json/adapter.d.ts +63 -0
- package/out/src/adapters/wof-postalcode-json/adapter.d.ts.map +1 -0
- package/out/src/adapters/wof-postalcode-json/adapter.js +178 -0
- package/out/src/adapters/wof-postalcode-json/adapter.js.map +1 -0
- package/out/src/align.d.ts +58 -0
- package/out/src/align.d.ts.map +1 -0
- package/out/src/align.js +139 -0
- package/out/src/align.js.map +1 -0
- package/out/src/build.d.ts +104 -0
- package/out/src/build.d.ts.map +1 -0
- package/out/src/build.js +201 -0
- package/out/src/build.js.map +1 -0
- package/out/src/codex/us-fips-state.d.ts +44 -0
- package/out/src/codex/us-fips-state.d.ts.map +1 -0
- package/out/src/codex/us-fips-state.js +105 -0
- package/out/src/codex/us-fips-state.js.map +1 -0
- package/out/src/codex/us-street-suffix.d.ts +259 -0
- package/out/src/codex/us-street-suffix.d.ts.map +1 -0
- package/out/src/codex/us-street-suffix.js +285 -0
- package/out/src/codex/us-street-suffix.js.map +1 -0
- package/out/src/format.d.ts +79 -0
- package/out/src/format.d.ts.map +1 -0
- package/out/src/format.js +151 -0
- package/out/src/format.js.map +1 -0
- package/out/src/golden.d.ts +50 -0
- package/out/src/golden.d.ts.map +1 -0
- package/out/src/golden.js +104 -0
- package/out/src/golden.js.map +1 -0
- package/out/src/index.d.ts +18 -0
- package/out/src/index.d.ts.map +1 -0
- package/out/src/index.js +18 -0
- package/out/src/index.js.map +1 -0
- package/out/src/parquet-wrapper/index.d.ts +12 -0
- package/out/src/parquet-wrapper/index.d.ts.map +1 -0
- package/out/src/parquet-wrapper/index.js +12 -0
- package/out/src/parquet-wrapper/index.js.map +1 -0
- package/out/src/parquet-wrapper/reader.d.ts +31 -0
- package/out/src/parquet-wrapper/reader.d.ts.map +1 -0
- package/out/src/parquet-wrapper/reader.js +54 -0
- package/out/src/parquet-wrapper/reader.js.map +1 -0
- package/out/src/parquet-wrapper/schema.d.ts +45 -0
- package/out/src/parquet-wrapper/schema.d.ts.map +1 -0
- package/out/src/parquet-wrapper/schema.js +55 -0
- package/out/src/parquet-wrapper/schema.js.map +1 -0
- package/out/src/parquet-wrapper/writer.d.ts +41 -0
- package/out/src/parquet-wrapper/writer.d.ts.map +1 -0
- package/out/src/parquet-wrapper/writer.js +71 -0
- package/out/src/parquet-wrapper/writer.js.map +1 -0
- package/out/src/parquet.d.ts +122 -0
- package/out/src/parquet.d.ts.map +1 -0
- package/out/src/parquet.js +220 -0
- package/out/src/parquet.js.map +1 -0
- package/out/src/runner.d.ts +100 -0
- package/out/src/runner.d.ts.map +1 -0
- package/out/src/runner.js +183 -0
- package/out/src/runner.js.map +1 -0
- package/out/src/split.d.ts +108 -0
- package/out/src/split.d.ts.map +1 -0
- package/out/src/split.js +191 -0
- package/out/src/split.js.map +1 -0
- package/out/src/synthesize.d.ts +146 -0
- package/out/src/synthesize.d.ts.map +1 -0
- package/out/src/synthesize.js +472 -0
- package/out/src/synthesize.js.map +1 -0
- package/out/src/tokenize.d.ts +47 -0
- package/out/src/tokenize.d.ts.map +1 -0
- package/out/src/tokenize.js +49 -0
- package/out/src/tokenize.js.map +1 -0
- package/out/src/types.d.ts +168 -0
- package/out/src/types.d.ts.map +1 -0
- package/out/src/types.js +19 -0
- package/out/src/types.js.map +1 -0
- package/out/src/wof-json.d.ts +105 -0
- package/out/src/wof-json.d.ts.map +1 -0
- package/out/src/wof-json.js +174 -0
- package/out/src/wof-json.js.map +1 -0
- package/package.json +36 -0
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `usgov-hrsa-fqhc`: HRSA "Health Center Service Delivery Site Locations" CSV consumer.
|
|
7
|
+
*
|
|
8
|
+
* Federally Qualified Health Centers (FQHCs) are HRSA-funded community health programs that
|
|
9
|
+
* self-report site addresses to the HRSA Data Warehouse. The published CSV (`data.hrsa.gov`)
|
|
10
|
+
* carries the site name, the postal-formatted street address, and the locality/region/postcode
|
|
11
|
+
* quad. Phase 1.6 §1.2 (#22) selects this source for its adversarial-value-per-row: every
|
|
12
|
+
* facility name is a human-typed venue string and the addresses pass through enough hands to
|
|
13
|
+
* accumulate the abbreviation drift + suite designator chaos that pure gazetteer data does not.
|
|
14
|
+
*
|
|
15
|
+
* The adapter consumes a CSV file the operator pre-downloads. The HRSA data is published as a
|
|
16
|
+
* single national CSV (~10K rows), small enough that the operator can re-fetch on every corpus
|
|
17
|
+
* rebuild without an intermediate SQLite step. Column names below match the HRSA Data Warehouse's
|
|
18
|
+
* "Health Center Service Delivery Site" public dataset. Operators substituting the
|
|
19
|
+
* closely-related "Site Address" or "Health Center" public extracts may need to remap columns;
|
|
20
|
+
* the README documents the expected set.
|
|
21
|
+
*
|
|
22
|
+
* Output: one row per CSV record, with `venue` component carrying the site name and the address
|
|
23
|
+
* components on `(house_number, street, locality, region, postcode)`. Component order is load-bearing:
|
|
24
|
+
* `venue` is inserted FIRST so alignment claims its surface span before `locality` searches for
|
|
25
|
+
* its own (the kryptonite case "Buffalo Health Clinic, …, Buffalo, NY" relies on `venue`
|
|
26
|
+
* consuming the first "Buffalo" so locality lands on the second).
|
|
27
|
+
*
|
|
28
|
+
* License: stamped `"Public Domain"` per the HRSA Data Warehouse's federal government distribution
|
|
29
|
+
* terms.
|
|
30
|
+
*/
|
|
31
|
+
import { parse as csvParse } from "csv-parse";
|
|
32
|
+
import { createReadStream } from "node:fs";
|
|
33
|
+
import { stableSourceId } from "../../adapter.js";
|
|
34
|
+
import { lookupStateAbbreviation } from "../../codex/us-fips-state.js";
|
|
35
|
+
import { reconcileComponents } from "../../format.js";
|
|
36
|
+
export const USGOV_HRSA_FQHC_ADAPTER_ID = "usgov-hrsa-fqhc";
|
|
37
|
+
export const USGOV_HRSA_FQHC_DEFAULT_LICENSE = "Public Domain";
|
|
38
|
+
/**
 * Pattern for a street line that opens with a house number.
 *
 * Group 1 is the house number: a run of digits, optionally followed by a hyphenated second run
 * (`"40-12 Bell Blvd"`) and optionally by a single trailing letter (`"123A Main St"`). Group 2 is
 * everything after the separating whitespace.
 */
const HOUSE_NUMBER_PREFIX = /^(\d+(?:-\d+)?[A-Za-z]?)\s+(.+)$/;

/**
 * Break a surface form such as "123 Main St Suite 4" into `(house_number, street)`.
 *
 * Suite / Apt / Unit designators are intentionally kept on `street` for Phase 1: Mailwoman has a
 * `unit` component, but the address-formatter has no clean slot for it and HRSA does not publish
 * the suite in a separate column. Keeping the surface form intact preserves the adversarial
 * training signal (a trailing "Suite 4" is part of the road line in this distribution).
 *
 * @param address Street-address string; surrounding whitespace is ignored.
 * @returns `null` when the input is blank; `{ house_number, street }` when the line opens with a
 *   recognised house number; otherwise `{ street }` holding the whole trimmed line.
 */
function splitAddress(address) {
    const line = address.trim();
    if (line.length === 0) {
        return null;
    }
    const match = line.match(HOUSE_NUMBER_PREFIX);
    if (match === null) {
        return { street: line };
    }
    const [, houseNumber, remainder] = match;
    return { house_number: houseNumber, street: remainder.trim() };
}
|
|
59
|
+
/**
 * Build the single-line, envelope-style address string for one site:
 *
 *   "<Site Name>, <house> <street>, <city>, <state> <postcode>"
 *
 * The site name comes first (the US addressee-then-address convention), so a downstream model sees
 * the venue-prefix-then-address shape that HRSA users actually type into geocoders. Any piece that
 * is empty or missing is dropped together with its separator.
 *
 * @param venue Site name; trimmed before use.
 * @param house House number, or a falsy value when the address has none.
 * @param street Road line.
 * @param city Locality name; trimmed before use.
 * @param state State abbreviation.
 * @param postcode Postal code.
 * @returns The composed line, or an empty string when every piece is empty.
 */
function composeRaw(venue, house, street, city, state, postcode) {
    // Drop falsy entries, then glue the survivors with the given separator.
    const keep = (parts, separator) => parts.filter(Boolean).join(separator);
    const roadLine = keep([house, street], " ").trim();
    const regionLine = keep([state, postcode], " ").trim();
    const placeLine = keep([city.trim(), regionLine], ", ");
    return keep([venue.trim(), roadLine, placeLine], ", ");
}
|
|
72
|
+
/**
 * Create the `usgov-hrsa-fqhc` corpus adapter.
 *
 * The returned object exposes `id`, `defaultLicense`, `description`, and an async generator
 * `rows(opts)` that streams the CSV at `opts.inputPath` and yields one row per usable record.
 *
 * `rows(opts)`:
 * - throws when `opts.country` is set to anything other than `"US"`;
 * - stops when `opts.signal` is aborted or `opts.limit` rows have been yielded;
 * - skips records missing a site name, site address, city, or postal code, records whose state
 *   abbreviation `lookupStateAbbreviation` does not resolve, and records for which
 *   `reconcileComponents` returns an empty object;
 * - rejects if the input file cannot be read or the CSV parser fails.
 *
 * @returns A new adapter object.
 */
export function createUsgovHrsaFqhcAdapter() {
    return {
        id: USGOV_HRSA_FQHC_ADAPTER_ID,
        defaultLicense: USGOV_HRSA_FQHC_DEFAULT_LICENSE,
        description: "HRSA Federally Qualified Health Center site locations (public-domain). Adversarial source: venue + address co-occurrence, hand-entered.",
        async *rows(opts) {
            if (opts.country && opts.country !== "US") {
                throw new Error(`usgov-hrsa-fqhc adapter: only US supported, got country=${opts.country}`);
            }
            const stream = createReadStream(opts.inputPath, { encoding: "utf8" });
            const parser = csvParse({
                columns: true,
                skip_empty_lines: true,
                relax_quotes: true,
                relax_column_count: true,
            });
            // `pipe()` does not forward errors from the source. Without this listener a failed
            // read (missing file, permission error) is an unhandled `error` event rather than a
            // rejection of the `for await` loop below.
            stream.once("error", (error) => parser.destroy(error));
            stream.pipe(parser);
            let emitted = 0;
            try {
                for await (const record of parser) {
                    if (opts.signal?.aborted)
                        break;
                    if (opts.limit !== undefined && emitted >= opts.limit)
                        break;
                    // `relax_column_count` lets short records through, so any column may be absent.
                    const venue = (record["Site Name"] ?? "").trim();
                    const split = splitAddress(record["Site Address"] ?? "");
                    const city = (record["Site City"] ?? "").trim();
                    const stateAbbr = (record["Site State Abbreviation"] ?? "").trim();
                    const postcode = (record["Site Postal Code"] ?? "").trim();
                    if (!venue || !split || !city || !postcode)
                        continue;
                    const state = lookupStateAbbreviation(stateAbbr);
                    if (!state)
                        continue;
                    // Insertion order matters here. `venue` first so alignment claims its span
                    // (which may contain a token like "Buffalo") before `locality` runs its
                    // search — the kryptonite case `Buffalo Health Clinic, Buffalo NY`
                    // otherwise mis-labels the venue's "Buffalo" as locality.
                    const components = {
                        venue,
                        ...(split.house_number ? { house_number: split.house_number } : {}),
                        street: split.street,
                        locality: city,
                        region: state.abbreviation,
                        postcode,
                    };
                    const raw = composeRaw(venue, split.house_number, split.street, city, state.abbreviation, postcode);
                    if (!raw)
                        continue;
                    const aligned = reconcileComponents(components, raw);
                    if (Object.keys(aligned).length === 0)
                        continue;
                    // Prefer the published site identifier; fall back to an id derived from the
                    // aligned components when the column is empty.
                    const siteId = (record["Site ID"] ?? "").trim();
                    const sourceId = siteId
                        ? `${USGOV_HRSA_FQHC_ADAPTER_ID}-${siteId}`
                        : stableSourceId(USGOV_HRSA_FQHC_ADAPTER_ID, aligned);
                    yield {
                        raw,
                        components: aligned,
                        country: "US",
                        locale: "en-US",
                        source: USGOV_HRSA_FQHC_ADAPTER_ID,
                        source_id: sourceId,
                        corpus_version: "",
                        license: USGOV_HRSA_FQHC_DEFAULT_LICENSE,
                    };
                    emitted++;
                }
            }
            finally {
                // Release the file handle on every exit path: completion, early break, or error.
                stream.destroy();
            }
        },
    };
}
/** Module-level adapter instance created by `createUsgovHrsaFqhcAdapter()`. */
export const usgovHrsaFqhcAdapter = createUsgovHrsaFqhcAdapter();
|
|
147
|
+
//# sourceMappingURL=adapter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../../../src/adapters/usgov-hrsa-fqhc/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAEH,OAAO,EAAE,KAAK,IAAI,QAAQ,EAAE,MAAM,WAAW,CAAA;AAC7C,OAAO,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAA;AAC1C,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAA;AACjD,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAA;AACtE,OAAO,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAA;AAGrD,MAAM,CAAC,MAAM,0BAA0B,GAAG,iBAAiB,CAAA;AAC3D,MAAM,CAAC,MAAM,+BAA+B,GAAG,eAAe,CAAA;AAkB9D;;;;;;;;;;GAUG;AACH,MAAM,mBAAmB,GAAG,kCAAkC,CAAA;AAE9D,SAAS,YAAY,CAAC,OAAe;IACpC,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,CAAA;IAC9B,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IACzB,MAAM,CAAC,GAAG,mBAAmB,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IAC3C,IAAI,CAAC;QAAE,OAAO,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,EAAE,CAAA;IAC1D,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAA;AAC3B,CAAC;AAED;;;;;;;GAOG;AACH,SAAS,UAAU,CAClB,KAAa,EACb,KAAyB,EACzB,MAAc,EACd,IAAY,EACZ,KAAa,EACb,QAAgB;IAEhB,MAAM,UAAU,GAAG,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACnE,MAAM,QAAQ,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAC7G,OAAO,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,UAAU,EAAE,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AACvE,CAAC;AAED,MAAM,UAAU,0BAA0B;IACzC,OAAO;QACN,EAAE,EAAE,0BAA0B;QAC9B,cAAc,EAAE,+BAA+B;QAC/C,WAAW,EACV,yIAAyI;QAE1I,KAAK,CAAC,CAAC,IAAI,CAAC,IAAoB;YAC/B,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;gBAC3C,MAAM,IAAI,KAAK,CAAC,2DAA2D,IAAI,CAAC,OAAO,EAAE,CAAC,CAAA;YAC3F,CAAC;YAED,MAAM,MAAM,GAAG,gBAAgB,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;YACrE,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CACzB,QAAQ,CAAC;gBACR,OAAO,EAAE,IAAI;gBACb,gBAAgB,EAAE,IAAI;gBACtB,YAAY,EAAE,IAAI;gBAClB,kBAAkB,EAAE,IAAI;aACxB,CAAC,CACF,CAAA;YAED,IAAI,OAAO,GAAG,CAAC,CAAA;YACf,IAAI,CA
AC;gBACJ,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,MAAoC,EAAE,CAAC;oBACjE,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;wBAAE,MAAK;oBAC/B,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,OAAO,IAAI,IAAI,CAAC,KAAK;wBAAE,MAAK;oBAE5D,MAAM,KAAK,GAAG,CAAC,MAAM,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAChD,MAAM,KAAK,GAAG,YAAY,CAAC,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC,CAAA;oBACxD,MAAM,IAAI,GAAG,CAAC,MAAM,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC/C,MAAM,SAAS,GAAG,CAAC,MAAM,CAAC,yBAAyB,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAClE,MAAM,QAAQ,GAAG,CAAC,MAAM,CAAC,kBAAkB,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAE1D,IAAI,CAAC,KAAK,IAAI,CAAC,KAAK,IAAI,CAAC,IAAI,IAAI,CAAC,QAAQ;wBAAE,SAAQ;oBACpD,MAAM,KAAK,GAAG,uBAAuB,CAAC,SAAS,CAAC,CAAA;oBAChD,IAAI,CAAC,KAAK;wBAAE,SAAQ;oBAEpB,2EAA2E;oBAC3E,wEAAwE;oBACxE,mEAAmE;oBACnE,0DAA0D;oBAC1D,MAAM,UAAU,GAA+B;wBAC9C,KAAK;wBACL,GAAG,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,KAAK,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;wBACnE,MAAM,EAAE,KAAK,CAAC,MAAM;wBACpB,QAAQ,EAAE,IAAI;wBACd,MAAM,EAAE,KAAK,CAAC,YAAY;wBAC1B,QAAQ;qBACR,CAAA;oBAED,MAAM,GAAG,GAAG,UAAU,CAAC,KAAK,EAAE,KAAK,CAAC,YAAY,EAAE,KAAK,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAA;oBACnG,IAAI,CAAC,GAAG;wBAAE,SAAQ;oBAElB,MAAM,OAAO,GAAG,mBAAmB,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;oBACpD,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,KAAK,CAAC;wBAAE,SAAQ;oBAE/C,MAAM,MAAM,GAAG,CAAC,MAAM,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC/C,MAAM,QAAQ,GAAG,MAAM;wBACtB,CAAC,CAAC,GAAG,0BAA0B,IAAI,MAAM,EAAE;wBAC3C,CAAC,CAAC,cAAc,CAAC,0BAA0B,EAAE,OAAO,CAAC,CAAA;oBAEtD,MAAM;wBACL,GAAG;wBACH,UAAU,EAAE,OAAO;wBACnB,OAAO,EAAE,IAAI;wBACb,MAAM,EAAE,OAAO;wBACf,MAAM,EAAE,0BAA0B;wBAClC,SAAS,EAAE,QAAQ;wBACnB,cAAc,EAAE,EAAE;wBAClB,OAAO,EAAE,+BAA+B;qBACxC,CAAA;oBACD,OAAO,EAAE,CAAA;gBACV,CAAC;YACF,CAAC;oBAAS,CAAC;gBACV,MAAM,CAAC,OAAO,EAAE,CAAA;YACjB,CAAC;QACF,CAAC;KACD,CAAA;AACF,CAAC;AAED,MAAM,CAAC,MAAM,oBAAoB,GAAG,0BAA0B,EAAE,CAAA"}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `usgov-imls-pls`: IMLS Public Libraries Survey outlet CSV consumer.
|
|
7
|
+
*
|
|
8
|
+
* The Institute of Museum and Library Services publishes an annual Public Libraries Survey with one
|
|
9
|
+
* row per library outlet (~17K rows). Each row carries the library name, street address, city,
|
|
10
|
+
* ZIP, county, and geocoordinates.
|
|
11
|
+
*
|
|
12
|
+
* The adapter consumes the outlet CSV the operator pre-downloads via `fetch-imls-pls.sh`. Column
|
|
13
|
+
* names match the IMLS PLS outlet file header.
|
|
14
|
+
*
|
|
15
|
+
* Output: one row per outlet with `venue` (library name), `(house_number, street, locality,
|
|
16
|
+
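* region, postcode, subregion)`. `source_id` is `usgov-imls-pls-<FSCSKEY>` when the key is present,
* otherwise it is derived from the aligned components via `stableSourceId`.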
|
|
17
|
+
*
|
|
18
|
+
* License: stamped `"Public Domain"` per IMLS federal government distribution terms.
|
|
19
|
+
*/
|
|
20
|
+
import type { CorpusAdapter } from "../../types.js";
|
|
21
|
+
/** Identifier of the IMLS Public Libraries Survey adapter. */
export declare const USGOV_IMLS_PLS_ADAPTER_ID = "usgov-imls-pls";
|
|
22
|
+
/** License string the adapter reports as its default license. */
export declare const USGOV_IMLS_PLS_DEFAULT_LICENSE = "Public Domain";
|
|
23
|
+
/** Creates a new `usgov-imls-pls` adapter object. */
export declare function createUsgovImlsPlsAdapter(): CorpusAdapter;
|
|
24
|
+
/** Module-level `usgov-imls-pls` adapter instance. */
export declare const usgovImlsPlsAdapter: CorpusAdapter;
|
|
25
|
+
//# sourceMappingURL=adapter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../../../src/adapters/usgov-imls-pls/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAOH,OAAO,KAAK,EAAgC,aAAa,EAAE,MAAM,gBAAgB,CAAA;AAEjF,eAAO,MAAM,yBAAyB,mBAAmB,CAAA;AACzD,eAAO,MAAM,8BAA8B,kBAAkB,CAAA;AAsB7D,wBAAgB,yBAAyB,IAAI,aAAa,CAsFzD;AAED,eAAO,MAAM,mBAAmB,eAA8B,CAAA"}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `usgov-imls-pls`: IMLS Public Libraries Survey outlet CSV consumer.
|
|
7
|
+
*
|
|
8
|
+
* The Institute of Museum and Library Services publishes an annual Public Libraries Survey with one
|
|
9
|
+
* row per library outlet (~17K rows). Each row carries the library name, street address, city,
|
|
10
|
+
* ZIP, county, and geocoordinates.
|
|
11
|
+
*
|
|
12
|
+
* The adapter consumes the outlet CSV the operator pre-downloads via `fetch-imls-pls.sh`. Column
|
|
13
|
+
* names match the IMLS PLS outlet file header.
|
|
14
|
+
*
|
|
15
|
+
* Output: one row per outlet with `venue` (library name), `(house_number, street, locality,
|
|
16
|
+
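* region, postcode, subregion)`. `source_id` is `usgov-imls-pls-<FSCSKEY>` when the key is present,
* otherwise it is derived from the aligned components via `stableSourceId`.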
|
|
17
|
+
*
|
|
18
|
+
* License: stamped `"Public Domain"` per IMLS federal government distribution terms.
|
|
19
|
+
*/
|
|
20
|
+
import { parse as csvParse } from "csv-parse";
|
|
21
|
+
import { createReadStream } from "node:fs";
|
|
22
|
+
import { stableSourceId } from "../../adapter.js";
|
|
23
|
+
import { lookupStateAbbreviation } from "../../codex/us-fips-state.js";
|
|
24
|
+
import { reconcileComponents } from "../../format.js";
|
|
25
|
+
export const USGOV_IMLS_PLS_ADAPTER_ID = "usgov-imls-pls";
|
|
26
|
+
export const USGOV_IMLS_PLS_DEFAULT_LICENSE = "Public Domain";
|
|
27
|
+
/**
 * Matches a street line that opens with a house number: digits, an optional hyphenated second
 * digit run, and an optional single trailing letter, followed by whitespace and the rest of the
 * line. Group 1 is the house number; group 2 is the remainder.
 */
const HOUSE_NUMBER_PREFIX = /^(\d+(?:-\d+)?[A-Za-z]?)\s+(.+)$/;

/**
 * Separate a street-address string into its leading house number and the remaining road line.
 *
 * @param address Street-address string; surrounding whitespace is ignored.
 * @returns `null` for a blank input; `{ house_number, street }` when the line opens with a house
 *   number; otherwise `{ street }` carrying the whole trimmed line.
 */
function splitAddress(address) {
    const cleaned = address.trim();
    if (cleaned === "") {
        return null;
    }
    const parts = HOUSE_NUMBER_PREFIX.exec(cleaned);
    return parts
        ? { house_number: parts[1], street: parts[2].trim() }
        : { street: cleaned };
}
|
|
37
|
+
/**
 * Create the `usgov-imls-pls` corpus adapter.
 *
 * The returned object exposes `id`, `defaultLicense`, `description`, and an async generator
 * `rows(opts)` that streams the outlet CSV at `opts.inputPath` and yields one row per usable
 * record.
 *
 * `rows(opts)`:
 * - throws when `opts.country` is set to anything other than `"US"`;
 * - stops when `opts.signal` is aborted or `opts.limit` rows have been yielded;
 * - skips records missing `LIBNAME`, `CITY`, `ZIP`, or `ADDRESS`, records whose `STABR`
 *   `lookupStateAbbreviation` does not resolve, and records for which `reconcileComponents`
 *   returns two or fewer components;
 * - rejects if the input file cannot be read or the CSV parser fails.
 *
 * @returns A new adapter object.
 */
export function createUsgovImlsPlsAdapter() {
    return {
        id: USGOV_IMLS_PLS_ADAPTER_ID,
        defaultLicense: USGOV_IMLS_PLS_DEFAULT_LICENSE,
        description: "IMLS Public Libraries Survey — ~17K library outlets with venue+address (public-domain).",
        async *rows(opts) {
            if (opts.country && opts.country !== "US") {
                throw new Error(`usgov-imls-pls adapter: only US supported, got country=${opts.country}`);
            }
            const stream = createReadStream(opts.inputPath, { encoding: "utf8" });
            const parser = csvParse({
                columns: true,
                skip_empty_lines: true,
                relax_quotes: true,
                relax_column_count: true,
            });
            // `pipe()` does not forward errors from the source. Without this listener a failed
            // read (missing file, permission error) is an unhandled `error` event rather than a
            // rejection of the `for await` loop below.
            stream.once("error", (error) => parser.destroy(error));
            stream.pipe(parser);
            let emitted = 0;
            try {
                for await (const record of parser) {
                    if (opts.signal?.aborted)
                        break;
                    if (opts.limit !== undefined && emitted >= opts.limit)
                        break;
                    // `relax_column_count` lets short records through, so any column may be absent.
                    const libName = (record.LIBNAME ?? "").trim();
                    const address = (record.ADDRESS ?? "").trim();
                    const city = (record.CITY ?? "").trim();
                    const zip = (record.ZIP ?? "").trim();
                    const stateAbbr = (record.STABR ?? "").trim();
                    const county = (record.CNTY ?? "").trim();
                    if (!libName || !city || !zip)
                        continue;
                    const state = lookupStateAbbreviation(stateAbbr);
                    if (!state)
                        continue;
                    const split = splitAddress(address);
                    if (!split)
                        continue;
                    // `venue` is inserted first, ahead of the address components.
                    const components = {
                        venue: libName,
                        ...(split.house_number ? { house_number: split.house_number } : {}),
                        street: split.street,
                        locality: city,
                        region: state.abbreviation,
                        postcode: zip,
                        ...(county ? { subregion: county } : {}),
                    };
                    const streetPart = [split.house_number, split.street].filter(Boolean).join(" ").trim();
                    // Use the resolved abbreviation (not the raw column value) so the `region`
                    // component and the text it is reconciled against are the same string.
                    const raw = [
                        libName,
                        streetPart,
                        [city, [state.abbreviation, zip].filter(Boolean).join(" ")].filter(Boolean).join(", "),
                    ]
                        .filter(Boolean)
                        .join(", ");
                    const aligned = reconcileComponents(components, raw);
                    // Rows with two or fewer surviving components are dropped.
                    if (Object.keys(aligned).length <= 2)
                        continue;
                    // Prefer the published FSCS key; fall back to an id derived from the aligned
                    // components when the column is empty.
                    const fscsKey = (record.FSCSKEY ?? "").trim();
                    const sourceId = fscsKey
                        ? `${USGOV_IMLS_PLS_ADAPTER_ID}-${fscsKey}`
                        : stableSourceId(USGOV_IMLS_PLS_ADAPTER_ID, aligned);
                    yield {
                        raw,
                        components: aligned,
                        country: "US",
                        locale: "en-US",
                        source: USGOV_IMLS_PLS_ADAPTER_ID,
                        source_id: sourceId,
                        corpus_version: "",
                        license: USGOV_IMLS_PLS_DEFAULT_LICENSE,
                    };
                    emitted++;
                }
            }
            finally {
                // Release the file handle on every exit path: completion, early break, or error.
                stream.destroy();
            }
        },
    };
}
/** Module-level adapter instance created by `createUsgovImlsPlsAdapter()`. */
export const usgovImlsPlsAdapter = createUsgovImlsPlsAdapter();
|
|
118
|
+
//# sourceMappingURL=adapter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../../../src/adapters/usgov-imls-pls/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,EAAE,KAAK,IAAI,QAAQ,EAAE,MAAM,WAAW,CAAA;AAC7C,OAAO,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAA;AAC1C,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAA;AACjD,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAA;AACtE,OAAO,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAA;AAGrD,MAAM,CAAC,MAAM,yBAAyB,GAAG,gBAAgB,CAAA;AACzD,MAAM,CAAC,MAAM,8BAA8B,GAAG,eAAe,CAAA;AAE7D,MAAM,mBAAmB,GAAG,kCAAkC,CAAA;AAY9D,SAAS,YAAY,CAAC,OAAe;IACpC,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,CAAA;IAC9B,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IACzB,MAAM,CAAC,GAAG,mBAAmB,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IAC3C,IAAI,CAAC;QAAE,OAAO,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,EAAE,CAAA;IAC1D,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAA;AAC3B,CAAC;AAED,MAAM,UAAU,yBAAyB;IACxC,OAAO;QACN,EAAE,EAAE,yBAAyB;QAC7B,cAAc,EAAE,8BAA8B;QAC9C,WAAW,EAAE,yFAAyF;QAEtG,KAAK,CAAC,CAAC,IAAI,CAAC,IAAoB;YAC/B,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;gBAC3C,MAAM,IAAI,KAAK,CAAC,0DAA0D,IAAI,CAAC,OAAO,EAAE,CAAC,CAAA;YAC1F,CAAC;YAED,MAAM,MAAM,GAAG,gBAAgB,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;YACrE,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CACzB,QAAQ,CAAC;gBACR,OAAO,EAAE,IAAI;gBACb,gBAAgB,EAAE,IAAI;gBACtB,YAAY,EAAE,IAAI;gBAClB,kBAAkB,EAAE,IAAI;aACxB,CAAC,CACF,CAAA;YAED,IAAI,OAAO,GAAG,CAAC,CAAA;YACf,IAAI,CAAC;gBACJ,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,MAAsC,EAAE,CAAC;oBACnE,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;wBAAE,MAAK;oBAC/B,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,OAAO,IAAI,IAAI,CAAC,KAAK;wBAAE,MAAK;oBAE5D,MAAM,OAAO,GAAG,CAAC,MAAM,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC7C,MAAM,OAAO,GAAG,CAAC,MAAM,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC7C,MAAM,IAAI,GAAG,CAAC,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBACvC,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBACrC,MAAM,SAAS,GAAG,CAAC,MAAM,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC7
C,MAAM,MAAM,GAAG,CAAC,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAEzC,IAAI,CAAC,OAAO,IAAI,CAAC,IAAI,IAAI,CAAC,GAAG;wBAAE,SAAQ;oBAEvC,MAAM,KAAK,GAAG,uBAAuB,CAAC,SAAS,CAAC,CAAA;oBAChD,IAAI,CAAC,KAAK;wBAAE,SAAQ;oBAEpB,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,CAAC,CAAA;oBACnC,IAAI,CAAC,KAAK;wBAAE,SAAQ;oBAEpB,MAAM,UAAU,GAA+B;wBAC9C,KAAK,EAAE,OAAO;wBACd,GAAG,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,KAAK,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;wBACnE,MAAM,EAAE,KAAK,CAAC,MAAM;wBACpB,QAAQ,EAAE,IAAI;wBACd,MAAM,EAAE,KAAK,CAAC,YAAY;wBAC1B,QAAQ,EAAE,GAAG;wBACb,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;qBACxC,CAAA;oBAED,MAAM,UAAU,GAAG,CAAC,KAAK,CAAC,YAAY,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;oBACtF,MAAM,GAAG,GAAG;wBACX,OAAO;wBACP,UAAU;wBACV,CAAC,IAAI,EAAE,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;qBAC7E;yBACC,MAAM,CAAC,OAAO,CAAC;yBACf,IAAI,CAAC,IAAI,CAAC,CAAA;oBAEZ,MAAM,OAAO,GAAG,mBAAmB,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;oBACpD,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,IAAI,CAAC;wBAAE,SAAQ;oBAE9C,MAAM,OAAO,GAAG,CAAC,MAAM,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC7C,MAAM,QAAQ,GAAG,OAAO;wBACvB,CAAC,CAAC,GAAG,yBAAyB,IAAI,OAAO,EAAE;wBAC3C,CAAC,CAAC,cAAc,CAAC,yBAAyB,EAAE,OAAO,CAAC,CAAA;oBAErD,MAAM;wBACL,GAAG;wBACH,UAAU,EAAE,OAAO;wBACnB,OAAO,EAAE,IAAI;wBACb,MAAM,EAAE,OAAO;wBACf,MAAM,EAAE,yBAAyB;wBACjC,SAAS,EAAE,QAAQ;wBACnB,cAAc,EAAE,EAAE;wBAClB,OAAO,EAAE,8BAA8B;qBACvC,CAAA;oBACD,OAAO,EAAE,CAAA;gBACV,CAAC;YACF,CAAC;oBAAS,CAAC;gBACV,MAAM,CAAC,OAAO,EAAE,CAAA;YACjB,CAAC;QACF,CAAC;KACD,CAAA;AACF,CAAC;AAED,MAAM,CAAC,MAAM,mBAAmB,GAAG,yBAAyB,EAAE,CAAA"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `usgov-nad`: US DOT National Address Database — ~97M structured address-point records.
|
|
7
|
+
*
|
|
8
|
+
* The single largest US address source available — federal aggregation of state + local 911-grade
|
|
9
|
+
* address points (every addressable location). Compared to TIGER ADDRFEAT (~20M segment-level, no
|
|
10
|
+
* city/locality) and NPPES (~7M provider-centric venues), NAD covers the entire residential +
|
|
11
|
+
* commercial address space with full structured components.
|
|
12
|
+
*
|
|
13
|
+
* The adapter consumes NDJSON shards produced by `fetch-nad.ts`'s featureserver mode (operator
|
|
14
|
+
* pre-downloads via `npx tsx packages/corpus/scripts/fetch-nad.ts`). Each shard is per-OID-range
|
|
15
|
+
* `oids_<start>-<end>.ndjson` with a sibling `.manifest.json`. Adapter iterates every `.ndjson`
|
|
16
|
+
* in the input directory, skipping the `quarantined-bash-bug/` subdir (legacy of the bash-
|
|
17
|
+
* fetcher's silent-page-failure bug).
|
|
18
|
+
*
|
|
19
|
+
* Field mapping (NAD v9 → CanonicalRow components):
|
|
20
|
+
*
|
|
21
|
+
* - House_number: `AddNo_Full` (pre-composed); falls back to AddNum_Pre + Add_Number + AddNum_Suf
|
|
22
|
+
* - Street: `StNam_Full` (pre-composed); falls back to St_PreDir + St_PreTyp + St_Name + St_PosTyp
|
|
23
|
+
*
|
|
24
|
+
* - St_PosDir + St_PosMod composition
|
|
25
|
+
* - Locality: `Post_City` > `Inc_Muni` > `Census_Plc` > `Uninc_Comm` (first non-empty)
|
|
26
|
+
* - Region: `State` (2-char USPS code, including territories: PR, GU, VI, AS, MP)
|
|
27
|
+
* - Postcode: `Zip_Code` + `Plus_4` (joined as `XXXXX-NNNN` when both present)
|
|
28
|
+
* - Venue: `LandmkName` (typically a park, school, hospital, named facility — when present)
|
|
29
|
+
*
|
|
30
|
+
* License: stamped `"Public Domain"` per 17 U.S.C. § 105 (US federal works).
|
|
31
|
+
*/
|
|
32
|
+
import type { CorpusAdapter } from "../../types.js";
|
|
33
|
+
/** Identifier string for the National Address Database adapter. */
export declare const USGOV_NAD_ADAPTER_ID = "usgov-nad";
|
|
34
|
+
export declare const USGOV_NAD_DEFAULT_LICENSE = "Public Domain";
|
|
35
|
+
export declare function createUsgovNadAdapter(): CorpusAdapter;
|
|
36
|
+
export declare const usgovNadAdapter: CorpusAdapter;
|
|
37
|
+
//# sourceMappingURL=adapter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../../../src/adapters/usgov-nad/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AAQH,OAAO,KAAK,EAAgC,aAAa,EAAE,MAAM,gBAAgB,CAAA;AAEjF,eAAO,MAAM,oBAAoB,cAAc,CAAA;AAC/C,eAAO,MAAM,yBAAyB,kBAAkB,CAAA;AA+JxD,wBAAgB,qBAAqB,IAAI,aAAa,CAuFrD;AAED,eAAO,MAAM,eAAe,eAA0B,CAAA"}
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `usgov-nad`: US DOT National Address Database — ~97M structured address-point records.
|
|
7
|
+
*
|
|
8
|
+
* The single largest US address source available — federal aggregation of state + local 911-grade
|
|
9
|
+
* address points (every addressable location). Compared to TIGER ADDRFEAT (~20M segment-level, no
|
|
10
|
+
* city/locality) and NPPES (~7M provider-centric venues), NAD covers the entire residential +
|
|
11
|
+
* commercial address space with full structured components.
|
|
12
|
+
*
|
|
13
|
+
* The adapter consumes NDJSON shards produced by `fetch-nad.ts`'s featureserver mode (operator
|
|
14
|
+
* pre-downloads via `npx tsx packages/corpus/scripts/fetch-nad.ts`). Each shard is per-OID-range
|
|
15
|
+
* `oids_<start>-<end>.ndjson` with a sibling `.manifest.json`. Adapter iterates every `.ndjson`
|
|
16
|
+
* in the input directory, skipping the `quarantined-bash-bug/` subdir (legacy of the bash-
|
|
17
|
+
* fetcher's silent-page-failure bug).
|
|
18
|
+
*
|
|
19
|
+
* Field mapping (NAD v9 → CanonicalRow components):
|
|
20
|
+
*
|
|
21
|
+
* - House_number: `AddNo_Full` (pre-composed); falls back to AddNum_Pre + Add_Number + AddNum_Suf
|
|
22
|
+
* - Street: `StNam_Full` (pre-composed); falls back to St_PreDir + St_PreTyp + St_Name + St_PosTyp
|
|
23
|
+
*
|
|
24
|
+
*   (Street fallback, continued) + St_PosDir + St_PosMod composition
|
|
25
|
+
* - Locality: `Post_City` > `Inc_Muni` > `Census_Plc` > `Uninc_Comm` (first non-empty)
|
|
26
|
+
* - Region: `State` (2-char USPS code, including territories: PR, GU, VI, AS, MP)
|
|
27
|
+
* - Postcode: `Zip_Code` + `Plus_4` (joined as `XXXXX-NNNN` when both present)
|
|
28
|
+
* - Venue: `LandmkName` (typically a park, school, hospital, named facility — when present)
|
|
29
|
+
*
|
|
30
|
+
* License: stamped `"Public Domain"` per 17 U.S.C. § 105 (US federal works).
|
|
31
|
+
*/
|
|
32
|
+
import { createReadStream } from "node:fs";
|
|
33
|
+
import { readdir } from "node:fs/promises";
|
|
34
|
+
import { join } from "node:path";
|
|
35
|
+
import { createInterface } from "node:readline";
|
|
36
|
+
import { reconcileComponents } from "../../format.js";
|
|
37
|
+
/** Identifier of this adapter; also used as the `source` value and the `source_id` prefix of emitted rows. */
export const USGOV_NAD_ADAPTER_ID = "usgov-nad";
/** License string stamped on every row this adapter emits. */
export const USGOV_NAD_DEFAULT_LICENSE = "Public Domain";
/**
 * Two-letter codes accepted in a record's `State` field: the 50 states, DC, and the
 * territories that ship in NAD. Records carrying any other value are skipped.
 */
const US_STATES_SET = new Set([
    // 50 states + DC
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL",
    "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME",
    "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH",
    "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",
    "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI",
    "WY",
    // Territories that ship in NAD
    "PR", "GU", "VI", "AS", "MP",
]);
|
|
98
|
+
/**
 * Returns the first argument that is non-empty once stringified and trimmed.
 *
 * `null` / `undefined` count as empty. Later arguments are not stringified once a
 * match has been found.
 *
 * @param values Candidate values, in priority order.
 * @returns The trimmed text of the first non-empty candidate, or `undefined` if none qualifies.
 */
function nonEmpty(...values) {
    for (let i = 0; i < values.length; i += 1) {
        const text = (values[i] ?? "").toString().trim();
        if (text.length > 0) {
            return text;
        }
    }
    return undefined;
}
|
|
106
|
+
/**
 * Derives the house-number component from a NAD record.
 *
 * Uses the pre-composed `AddNo_Full` when it is non-blank. Otherwise joins
 * `AddNum_Pre`, `Add_Number` and `AddNum_Suf` with single spaces, which requires a
 * non-blank `Add_Number`.
 *
 * @param r Parsed NAD record.
 * @returns The house number text, or `undefined` when the record carries no number.
 */
function composeHouseNumber(r) {
    const precomposed = (r.AddNo_Full ?? "").toString().trim();
    if (precomposed) {
        return precomposed;
    }
    // `Add_Number` may be numeric, so it goes through String() rather than .toString().
    const number = r.Add_Number == null ? "" : String(r.Add_Number).trim();
    if (number === "") {
        return undefined;
    }
    const prefix = (r.AddNum_Pre ?? "").toString().trim();
    const suffix = (r.AddNum_Suf ?? "").toString().trim();
    const pieces = [];
    for (const piece of [prefix, number, suffix]) {
        if (piece) {
            pieces.push(piece);
        }
    }
    const joined = pieces.join(" ").trim();
    return joined ? joined : undefined;
}
|
|
117
|
+
/**
 * Derives the street component from a NAD record.
 *
 * Uses the pre-composed `StNam_Full` when it is non-blank. Otherwise joins the
 * non-blank street-name parts with single spaces, in this order: `St_PreMod`,
 * `St_PreDir`, `St_PreTyp`, `St_PreSep`, `St_Name`, `St_PosTyp`, `St_PosDir`,
 * `St_PosMod`.
 *
 * @param r Parsed NAD record.
 * @returns The street text, or `undefined` when every part is blank.
 */
function composeStreet(r) {
    const precomposed = (r.StNam_Full ?? "").toString().trim();
    if (precomposed) {
        return precomposed;
    }
    const rawParts = [r.St_PreMod, r.St_PreDir, r.St_PreTyp, r.St_PreSep, r.St_Name, r.St_PosTyp, r.St_PosDir, r.St_PosMod];
    const kept = [];
    for (const rawPart of rawParts) {
        const text = (rawPart ?? "").toString().trim();
        if (text) {
            kept.push(text);
        }
    }
    if (kept.length === 0) {
        return undefined;
    }
    return kept.join(" ");
}
|
|
126
|
+
/**
 * Picks the locality for a NAD record: the first non-blank value among `Post_City`,
 * `Inc_Muni`, `Census_Plc` and `Uninc_Comm`, in that priority order.
 *
 * @param r Parsed NAD record.
 * @returns The trimmed locality name, or `undefined` when all four fields are blank.
 */
function composeLocality(r) {
    const postalCity = r.Post_City;
    const municipality = r.Inc_Muni;
    const censusPlace = r.Census_Plc;
    const unincorporated = r.Uninc_Comm;
    return nonEmpty(postalCity, municipality, censusPlace, unincorporated);
}
|
|
129
|
+
/**
 * Derives the postcode component from a NAD record.
 *
 * @param r Parsed NAD record.
 * @returns `Zip_Code` alone, or `Zip_Code-Plus_4` when `Plus_4` is non-blank;
 *   `undefined` when `Zip_Code` is blank (a `Plus_4` on its own is ignored).
 */
function composePostcode(r) {
    const zipCode = (r.Zip_Code ?? "").toString().trim();
    if (zipCode === "") {
        return undefined;
    }
    const extension = (r.Plus_4 ?? "").toString().trim();
    if (extension) {
        return `${zipCode}-${extension}`;
    }
    return zipCode;
}
|
|
136
|
+
/**
 * Renders the single-line address string for a set of components.
 *
 * Layout: `<venue>, <houseNumber> <street>, <locality>, <region> <postcode>`. The venue
 * and the street line are omitted when empty; the trailing
 * `locality, region postcode` segment is always present.
 *
 * @param parts Object with `venue`, `houseNumber`, `street`, `locality`, `region`, `postcode`.
 * @returns The comma-separated address line.
 */
function composeRaw(parts) {
    const streetPieces = [parts.houseNumber, parts.street].filter((piece) => Boolean(piece));
    const streetLine = streetPieces.join(" ").trim();
    const cityLine = `${parts.locality}, ${parts.region} ${parts.postcode}`;
    const segments = [parts.venue, streetLine ? streetLine : undefined, cityLine];
    return segments.filter((segment) => Boolean(segment)).join(", ");
}
|
|
141
|
+
/**
 * Builds the `usgov-nad` corpus adapter.
 *
 * The returned object exposes `id`, `defaultLicense`, `description` and an async
 * generator `rows(opts)`. `rows` lists `opts.inputPath` (non-recursively), keeps the
 * entries whose name ends in `.ndjson`, processes them in sorted name order, and
 * yields one row per usable line.
 *
 * Options read by `rows(opts)`:
 * - `inputPath`: directory holding the NDJSON shards.
 * - `country`: when set, must be `"US"`.
 * - `limit`: when defined, iteration stops once that many rows have been yielded.
 * - `signal`: when its `aborted` flag is set, iteration stops at the next shard or line.
 *
 * A line is skipped (never fatal) when it is blank, is not valid JSON, parses to
 * something other than an object, has a `State` outside `US_STATES_SET`, lacks a
 * locality or a postcode, or when `reconcileComponents` returns two or fewer keys.
 *
 * @returns The adapter object.
 * @throws {Error} From `rows()` when `opts.country` is set to anything but `"US"`.
 */
export function createUsgovNadAdapter() {
    return {
        id: USGOV_NAD_ADAPTER_ID,
        defaultLicense: USGOV_NAD_DEFAULT_LICENSE,
        description: "US DOT National Address Database — ~97M structured US address points (911-grade). Single largest US source.",
        async *rows(opts) {
            if (opts.country && opts.country !== "US") {
                throw new Error(`usgov-nad adapter: only US supported, got country=${opts.country}`);
            }
            // inputPath is a directory of NDJSON shards (per fetch-nad.ts featureserver output).
            // Single-file inputs (e.g. a bulk-extracted CSV) are not currently supported — the
            // featureserver shard pattern is the primary distribution.
            const entries = await readdir(opts.inputPath);
            // Sorting makes the shard order (and therefore the row order) deterministic.
            const shards = entries.filter((n) => n.endsWith(".ndjson")).sort();
            // Rows yielded so far, across all shards; drives `limit` and the last-resort source id.
            let emitted = 0;
            outer: for (const shard of shards) {
                if (opts.signal?.aborted)
                    break;
                const stream = createReadStream(join(opts.inputPath, shard), { encoding: "utf8" });
                const rl = createInterface({ input: stream, crlfDelay: Infinity });
                try {
                    for await (const line of rl) {
                        if (opts.signal?.aborted)
                            break outer;
                        if (opts.limit !== undefined && emitted >= opts.limit)
                            break outer;
                        if (!line)
                            continue;
                        let record;
                        try {
                            record = JSON.parse(line);
                        }
                        catch {
                            continue; // malformed line — skip silently
                        }
                        // JSON.parse accepts bare `null`, numbers, strings and booleans. Reading
                        // `.State` off `null` would throw and abort the whole stream, so treat any
                        // non-object value like a malformed line.
                        if (record === null || typeof record !== "object")
                            continue;
                        const state = (record.State ?? "").toString().trim().toUpperCase();
                        if (!US_STATES_SET.has(state))
                            continue;
                        // Locality and postcode are mandatory; street, house number and venue are optional.
                        const locality = composeLocality(record);
                        if (!locality)
                            continue;
                        const postcode = composePostcode(record);
                        if (!postcode)
                            continue;
                        const street = composeStreet(record);
                        const houseNumber = composeHouseNumber(record);
                        const venue = nonEmpty(record.LandmkName);
                        // Optional components are left out entirely rather than set to undefined.
                        const components = {
                            ...(venue ? { venue } : {}),
                            ...(houseNumber ? { house_number: houseNumber } : {}),
                            ...(street ? { street } : {}),
                            locality,
                            region: state,
                            postcode,
                        };
                        const raw = composeRaw({ venue, houseNumber, street, locality, region: state, postcode });
                        if (!raw)
                            continue;
                        const aligned = reconcileComponents(components, raw);
                        // NOTE(review): presumably drops rows where little more than region/postcode
                        // survived reconciliation — confirm against reconcileComponents.
                        if (Object.keys(aligned).length <= 2)
                            continue;
                        // Prefer the record's UUID, then its OBJECTID, then a shard-relative counter.
                        const sourceId = record.UUID
                            ? `${USGOV_NAD_ADAPTER_ID}-${record.UUID}`
                            : `${USGOV_NAD_ADAPTER_ID}-${record.OBJECTID ?? `${shard}:${emitted}`}`;
                        yield {
                            raw,
                            components: aligned,
                            country: "US",
                            locale: "en-US",
                            source: USGOV_NAD_ADAPTER_ID,
                            source_id: sourceId,
                            corpus_version: "",
                            license: USGOV_NAD_DEFAULT_LICENSE,
                        };
                        emitted++;
                    }
                }
                finally {
                    // Runs on normal completion, on `break outer`, and when the consumer stops
                    // iterating early, so the file handle is always released.
                    rl.close();
                    stream.destroy();
                }
            }
        },
    };
}
/** Shared adapter instance created at module load. */
export const usgovNadAdapter = createUsgovNadAdapter();
|
|
227
|
+
//# sourceMappingURL=adapter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../../../src/adapters/usgov-nad/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AAEH,OAAO,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAA;AAC1C,OAAO,EAAE,OAAO,EAAE,MAAM,kBAAkB,CAAA;AAC1C,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAA;AAChC,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAA;AAE/C,OAAO,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAA;AAGrD,MAAM,CAAC,MAAM,oBAAoB,GAAG,WAAW,CAAA;AAC/C,MAAM,CAAC,MAAM,yBAAyB,GAAG,eAAe,CAAA;AAgDxD,MAAM,aAAa,GAAG,IAAI,GAAG,CAAC;IAC7B,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,+BAA+B;IAC/B,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;CACJ,CAAC,CAAA;AAEF,SAAS,QAAQ,CAAC,GAAG,MAAwC;IAC5D,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACxB,MAAM,OAAO,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;QAC3C,IAAI,OAAO;YAAE,OAAO,OAAO,CAAA;IAC5B,CAAC;IACD,OAAO,SAAS,CAAA;AACjB,CAAC;AAED,SAAS,kBAAkB,CAAC,CAAY;IACvC,MAAM,IAAI,GAAG,CAAC,CAAC,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;IACnD,IAAI,IAAI;QAAE,OAAO,IAAI,CAAA;IACrB,MAAM,GAAG,GAAG,CAAC,CAAC,UAAU,IAAI,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,IAAI,EAAE,CAAA;IACnE,IAAI,CAAC,GAAG;QAAE,OAAO,SAAS,CAAA;IAC1B,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;IAClD,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;IAClD,OAAO,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,IAAI,SAAS,CAAA;AACrE,CAAC;AAED,SAAS,aAAa,CAAC,CAAY;IAC
lC,MAAM,IAAI,GAAG,CAAC,CAAC,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;IACnD,IAAI,IAAI;QAAE,OAAO,IAAI,CAAA;IACrB,MAAM,KAAK,GAAG,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,CAAC;SAClH,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAC;SACvC,MAAM,CAAC,OAAO,CAAC,CAAA;IACjB,OAAO,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAA;AAClD,CAAC;AAED,SAAS,eAAe,CAAC,CAAY;IACpC,OAAO,QAAQ,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,UAAU,CAAC,CAAA;AACrE,CAAC;AAED,SAAS,eAAe,CAAC,CAAY;IACpC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;IAChD,IAAI,CAAC,GAAG;QAAE,OAAO,SAAS,CAAA;IAC1B,MAAM,KAAK,GAAG,CAAC,CAAC,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;IAChD,OAAO,KAAK,CAAC,CAAC,CAAC,GAAG,GAAG,IAAI,KAAK,EAAE,CAAC,CAAC,CAAC,GAAG,CAAA;AACvC,CAAC;AAED,SAAS,UAAU,CAAC,KAOnB;IACA,MAAM,UAAU,GAAG,CAAC,KAAK,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACrF,MAAM,IAAI,GAAG,GAAG,KAAK,CAAC,QAAQ,KAAK,KAAK,CAAC,MAAM,IAAI,KAAK,CAAC,QAAQ,EAAE,CAAA;IACnE,OAAO,CAAC,KAAK,CAAC,KAAK,EAAE,UAAU,IAAI,SAAS,EAAE,IAAI,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AAC/E,CAAC;AAED,MAAM,UAAU,qBAAqB;IACpC,OAAO;QACN,EAAE,EAAE,oBAAoB;QACxB,cAAc,EAAE,yBAAyB;QACzC,WAAW,EACV,6GAA6G;QAE9G,KAAK,CAAC,CAAC,IAAI,CAAC,IAAoB;YAC/B,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;gBAC3C,MAAM,IAAI,KAAK,CAAC,qDAAqD,IAAI,CAAC,OAAO,EAAE,CAAC,CAAA;YACrF,CAAC;YAED,qFAAqF;YACrF,mFAAmF;YACnF,2DAA2D;YAC3D,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;YAC7C,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;YAElE,IAAI,OAAO,GAAG,CAAC,CAAA;YACf,KAAK,EAAE,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;gBACn
C,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;oBAAE,MAAK;gBAC/B,MAAM,MAAM,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,KAAK,CAAC,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;gBAClF,MAAM,EAAE,GAAG,eAAe,CAAC,EAAE,KAAK,EAAE,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAA;gBAClE,IAAI,CAAC;oBACJ,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,EAAE,EAAE,CAAC;wBAC7B,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;4BAAE,MAAM,KAAK,CAAA;wBACrC,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,OAAO,IAAI,IAAI,CAAC,KAAK;4BAAE,MAAM,KAAK,CAAA;wBAClE,IAAI,CAAC,IAAI;4BAAE,SAAQ;wBAEnB,IAAI,MAAiB,CAAA;wBACrB,IAAI,CAAC;4BACJ,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAc,CAAA;wBACvC,CAAC;wBAAC,MAAM,CAAC;4BACR,SAAQ,CAAC,iCAAiC;wBAC3C,CAAC;wBAED,MAAM,KAAK,GAAG,CAAC,MAAM,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAA;wBAClE,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,KAAK,CAAC;4BAAE,SAAQ;wBAEvC,MAAM,QAAQ,GAAG,eAAe,CAAC,MAAM,CAAC,CAAA;wBACxC,IAAI,CAAC,QAAQ;4BAAE,SAAQ;wBAEvB,MAAM,QAAQ,GAAG,eAAe,CAAC,MAAM,CAAC,CAAA;wBACxC,IAAI,CAAC,QAAQ;4BAAE,SAAQ;wBAEvB,MAAM,MAAM,GAAG,aAAa,CAAC,MAAM,CAAC,CAAA;wBACpC,MAAM,WAAW,GAAG,kBAAkB,CAAC,MAAM,CAAC,CAAA;wBAC9C,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,UAAU,CAAC,CAAA;wBAEzC,MAAM,UAAU,GAA+B;4BAC9C,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;4BAC3B,GAAG,CAAC,WAAW,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;4BACrD,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;4BAC7B,QAAQ;4BACR,MAAM,EAAE,KAAK;4BACb,QAAQ;yBACR,CAAA;wBAED,MAAM,GAAG,GAAG,UAAU,CAAC,EAAE,KAAK,EAAE,WAAW,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAA;wBACzF,IAAI,CAAC,GAAG;4BAAE,SAAQ;wBAElB,MAAM,OAAO,GAAG,mBAAmB,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;wBACpD,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,IAAI,CAAC;4BAAE,SAAQ;wBAE9C,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI;4BAC3B,CAAC,CAAC,GAAG,oBAAoB,IAAI,MAAM,CAAC,IAAI,EAAE;4BAC1C,CAAC,CAAC,GAAG,oBAAoB,IAAI,MAAM,CAAC,QAAQ,IAAI,GAAG,KAAK,IAAI,OAAO,EAAE,EAAE,CAAA;wBAExE,MAAM;4BACL,GAAG;4BACH,UAAU,EAAE,OAAO;4BACnB,OAAO,EAAE,IAAI;4BACb,MAAM,EAAE,OAAO;4BACf,MAAM,
EAAE,oBAAoB;4BAC5B,SAAS,EAAE,QAAQ;4BACnB,cAAc,EAAE,EAAE;4BAClB,OAAO,EAAE,yBAAyB;yBAClC,CAAA;wBACD,OAAO,EAAE,CAAA;oBACV,CAAC;gBACF,CAAC;wBAAS,CAAC;oBACV,EAAE,CAAC,KAAK,EAAE,CAAA;oBACV,MAAM,CAAC,OAAO,EAAE,CAAA;gBACjB,CAAC;YACF,CAAC;QACF,CAAC;KACD,CAAA;AACF,CAAC;AAED,MAAM,CAAC,MAAM,eAAe,GAAG,qBAAqB,EAAE,CAAA"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `usgov-nppes`: CMS National Plan and Provider Enumeration System (NPI registry) CSV consumer.
|
|
7
|
+
*
|
|
8
|
+
* NPPES is the authoritative US healthcare provider registry, published monthly by CMS. Each row
|
|
9
|
+
* carries a provider's business practice location address together with their legal business name
|
|
10
|
+
* or individual name. At ~7M rows it is the single largest venue+address signal source
|
|
11
|
+
* available.
|
|
12
|
+
*
|
|
13
|
+
* The adapter consumes the monthly full-replacement CSV (operator pre-downloads via
|
|
14
|
+
* `fetch-nppes.sh`). Column names match the canonical NPPES "Full Replacement Monthly NPI File"
|
|
15
|
+
* header published at `https://download.cms.gov/nppes/NPI_Files.html`.
|
|
16
|
+
*
|
|
17
|
+
* Output: one row per CSV record where the practice location address is populated. Organization
|
|
18
|
+
* rows carry `venue` from the legal business name; individual rows compose `attention` from
|
|
19
|
+
* last+first name. Address components go on `(house_number, street, locality, region, postcode)`.
|
|
20
|
+
*
|
|
21
|
+
* License: stamped `"Public Domain"` per CMS's federal government distribution terms.
|
|
22
|
+
*/
|
|
23
|
+
import type { CorpusAdapter } from "../../types.js";
|
|
24
|
+
export declare const USGOV_NPPES_ADAPTER_ID = "usgov-nppes";
|
|
25
|
+
export declare const USGOV_NPPES_DEFAULT_LICENSE = "Public Domain";
|
|
26
|
+
export declare function createUsgovNppesAdapter(): CorpusAdapter;
|
|
27
|
+
export declare const usgovNppesAdapter: CorpusAdapter;
|
|
28
|
+
//# sourceMappingURL=adapter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../../../src/adapters/usgov-nppes/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAOH,OAAO,KAAK,EAAgC,aAAa,EAAE,MAAM,gBAAgB,CAAA;AAEjF,eAAO,MAAM,sBAAsB,gBAAgB,CAAA;AACnD,eAAO,MAAM,2BAA2B,kBAAkB,CAAA;AAsC1D,wBAAgB,uBAAuB,IAAI,aAAa,CAqFvD;AAED,eAAO,MAAM,iBAAiB,eAA4B,CAAA"}
|