@mailwoman/corpus 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/src/adapter.d.ts +96 -0
- package/out/src/adapter.d.ts.map +1 -0
- package/out/src/adapter.js +107 -0
- package/out/src/adapter.js.map +1 -0
- package/out/src/adapters/ban/adapter.d.ts +32 -0
- package/out/src/adapters/ban/adapter.d.ts.map +1 -0
- package/out/src/adapters/ban/adapter.js +133 -0
- package/out/src/adapters/ban/adapter.js.map +1 -0
- package/out/src/adapters/fcc-bdc/adapter.d.ts +61 -0
- package/out/src/adapters/fcc-bdc/adapter.d.ts.map +1 -0
- package/out/src/adapters/fcc-bdc/adapter.js +153 -0
- package/out/src/adapters/fcc-bdc/adapter.js.map +1 -0
- package/out/src/adapters/index.d.ts +42 -0
- package/out/src/adapters/index.d.ts.map +1 -0
- package/out/src/adapters/index.js +76 -0
- package/out/src/adapters/index.js.map +1 -0
- package/out/src/adapters/openaddresses/adapter.d.ts +60 -0
- package/out/src/adapters/openaddresses/adapter.d.ts.map +1 -0
- package/out/src/adapters/openaddresses/adapter.js +174 -0
- package/out/src/adapters/openaddresses/adapter.js.map +1 -0
- package/out/src/adapters/state-ia-contractors/adapter.d.ts +23 -0
- package/out/src/adapters/state-ia-contractors/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-ia-contractors/adapter.js +113 -0
- package/out/src/adapters/state-ia-contractors/adapter.js.map +1 -0
- package/out/src/adapters/state-ny-notaries/adapter.d.ts +21 -0
- package/out/src/adapters/state-ny-notaries/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-ny-notaries/adapter.js +132 -0
- package/out/src/adapters/state-ny-notaries/adapter.js.map +1 -0
- package/out/src/adapters/state-tx-notaries/adapter.d.ts +22 -0
- package/out/src/adapters/state-tx-notaries/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-tx-notaries/adapter.js +125 -0
- package/out/src/adapters/state-tx-notaries/adapter.js.map +1 -0
- package/out/src/adapters/tiger/adapter.d.ts +45 -0
- package/out/src/adapters/tiger/adapter.d.ts.map +1 -0
- package/out/src/adapters/tiger/adapter.js +179 -0
- package/out/src/adapters/tiger/adapter.js.map +1 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts +36 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.js +147 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.js.map +1 -0
- package/out/src/adapters/usgov-imls-pls/adapter.d.ts +25 -0
- package/out/src/adapters/usgov-imls-pls/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-imls-pls/adapter.js +118 -0
- package/out/src/adapters/usgov-imls-pls/adapter.js.map +1 -0
- package/out/src/adapters/usgov-nad/adapter.d.ts +37 -0
- package/out/src/adapters/usgov-nad/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-nad/adapter.js +227 -0
- package/out/src/adapters/usgov-nad/adapter.js.map +1 -0
- package/out/src/adapters/usgov-nppes/adapter.d.ts +28 -0
- package/out/src/adapters/usgov-nppes/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-nppes/adapter.js +123 -0
- package/out/src/adapters/usgov-nppes/adapter.js.map +1 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts +35 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js +162 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js.map +1 -0
- package/out/src/adapters/wof-admin-json/adapter.d.ts +85 -0
- package/out/src/adapters/wof-admin-json/adapter.d.ts.map +1 -0
- package/out/src/adapters/wof-admin-json/adapter.js +241 -0
- package/out/src/adapters/wof-admin-json/adapter.js.map +1 -0
- package/out/src/adapters/wof-postalcode-json/adapter.d.ts +63 -0
- package/out/src/adapters/wof-postalcode-json/adapter.d.ts.map +1 -0
- package/out/src/adapters/wof-postalcode-json/adapter.js +178 -0
- package/out/src/adapters/wof-postalcode-json/adapter.js.map +1 -0
- package/out/src/align.d.ts +58 -0
- package/out/src/align.d.ts.map +1 -0
- package/out/src/align.js +139 -0
- package/out/src/align.js.map +1 -0
- package/out/src/build.d.ts +104 -0
- package/out/src/build.d.ts.map +1 -0
- package/out/src/build.js +201 -0
- package/out/src/build.js.map +1 -0
- package/out/src/codex/us-fips-state.d.ts +44 -0
- package/out/src/codex/us-fips-state.d.ts.map +1 -0
- package/out/src/codex/us-fips-state.js +105 -0
- package/out/src/codex/us-fips-state.js.map +1 -0
- package/out/src/codex/us-street-suffix.d.ts +259 -0
- package/out/src/codex/us-street-suffix.d.ts.map +1 -0
- package/out/src/codex/us-street-suffix.js +285 -0
- package/out/src/codex/us-street-suffix.js.map +1 -0
- package/out/src/format.d.ts +79 -0
- package/out/src/format.d.ts.map +1 -0
- package/out/src/format.js +151 -0
- package/out/src/format.js.map +1 -0
- package/out/src/golden.d.ts +50 -0
- package/out/src/golden.d.ts.map +1 -0
- package/out/src/golden.js +104 -0
- package/out/src/golden.js.map +1 -0
- package/out/src/index.d.ts +18 -0
- package/out/src/index.d.ts.map +1 -0
- package/out/src/index.js +18 -0
- package/out/src/index.js.map +1 -0
- package/out/src/parquet-wrapper/index.d.ts +12 -0
- package/out/src/parquet-wrapper/index.d.ts.map +1 -0
- package/out/src/parquet-wrapper/index.js +12 -0
- package/out/src/parquet-wrapper/index.js.map +1 -0
- package/out/src/parquet-wrapper/reader.d.ts +31 -0
- package/out/src/parquet-wrapper/reader.d.ts.map +1 -0
- package/out/src/parquet-wrapper/reader.js +54 -0
- package/out/src/parquet-wrapper/reader.js.map +1 -0
- package/out/src/parquet-wrapper/schema.d.ts +45 -0
- package/out/src/parquet-wrapper/schema.d.ts.map +1 -0
- package/out/src/parquet-wrapper/schema.js +55 -0
- package/out/src/parquet-wrapper/schema.js.map +1 -0
- package/out/src/parquet-wrapper/writer.d.ts +41 -0
- package/out/src/parquet-wrapper/writer.d.ts.map +1 -0
- package/out/src/parquet-wrapper/writer.js +71 -0
- package/out/src/parquet-wrapper/writer.js.map +1 -0
- package/out/src/parquet.d.ts +122 -0
- package/out/src/parquet.d.ts.map +1 -0
- package/out/src/parquet.js +220 -0
- package/out/src/parquet.js.map +1 -0
- package/out/src/runner.d.ts +100 -0
- package/out/src/runner.d.ts.map +1 -0
- package/out/src/runner.js +183 -0
- package/out/src/runner.js.map +1 -0
- package/out/src/split.d.ts +108 -0
- package/out/src/split.d.ts.map +1 -0
- package/out/src/split.js +191 -0
- package/out/src/split.js.map +1 -0
- package/out/src/synthesize.d.ts +146 -0
- package/out/src/synthesize.d.ts.map +1 -0
- package/out/src/synthesize.js +472 -0
- package/out/src/synthesize.js.map +1 -0
- package/out/src/tokenize.d.ts +47 -0
- package/out/src/tokenize.d.ts.map +1 -0
- package/out/src/tokenize.js +49 -0
- package/out/src/tokenize.js.map +1 -0
- package/out/src/types.d.ts +168 -0
- package/out/src/types.d.ts.map +1 -0
- package/out/src/types.js +19 -0
- package/out/src/types.js.map +1 -0
- package/out/src/wof-json.d.ts +105 -0
- package/out/src/wof-json.d.ts.map +1 -0
- package/out/src/wof-json.js +174 -0
- package/out/src/wof-json.js.map +1 -0
- package/package.json +36 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `usgov-nppes`: CMS National Plan and Provider Enumeration System (NPI registry) CSV consumer.
|
|
7
|
+
*
|
|
8
|
+
* NPPES is the authoritative US healthcare provider registry, published monthly by CMS. Each row
|
|
9
|
+
* carries a provider's business practice location address together with their legal business name
|
|
10
|
+
* or individual name. At ~7M rows it is the single largest venue+address signal source
|
|
11
|
+
* available.
|
|
12
|
+
*
|
|
13
|
+
* The adapter consumes the monthly full-replacement CSV (operator pre-downloads via
|
|
14
|
+
* `fetch-nppes.sh`). Column names match the canonical NPPES "Full Replacement Monthly NPI File"
|
|
15
|
+
* header published at `https://download.cms.gov/nppes/NPI_Files.html`.
|
|
16
|
+
*
|
|
17
|
+
* Output: one row per CSV record where the practice location address is populated. Organization
|
|
18
|
+
* rows carry `venue` from the legal business name; rows without one fall back to a `venue`
|
|
19
|
+
* composed from first+last name. Address parts go on `(house_number, street, locality, region, postcode)`.
|
|
20
|
+
*
|
|
21
|
+
* License: stamped `"Public Domain"` per CMS's federal government distribution terms.
|
|
22
|
+
*/
|
|
23
|
+
import { parse as csvParse } from "csv-parse";
|
|
24
|
+
import { createReadStream } from "node:fs";
|
|
25
|
+
import { stableSourceId } from "../../adapter.js";
|
|
26
|
+
import { lookupStateAbbreviation } from "../../codex/us-fips-state.js";
|
|
27
|
+
import { reconcileComponents } from "../../format.js";
|
|
28
|
+
export const USGOV_NPPES_ADAPTER_ID = "usgov-nppes";
|
|
29
|
+
export const USGOV_NPPES_DEFAULT_LICENSE = "Public Domain";
|
|
30
|
+
/**
 * Leading house number (digits, optional `-digits` range, optional single-letter suffix),
 * then whitespace, then the remainder of the line.
 */
const HOUSE_NUMBER_PREFIX = /^(\d+(?:-\d+)?[A-Za-z]?)\s+(.+)$/;

/**
 * Split a one-line street address into an optional house number and the street text.
 *
 * @param address Raw street line; surrounding whitespace is ignored.
 * @returns `null` for a blank line; `{ house_number, street }` when the line starts with a
 *   house number followed by more text; otherwise `{ street }` holding the whole trimmed line.
 */
function splitAddress(address) {
    const line = address.trim();
    if (line.length === 0) {
        return null;
    }
    const match = HOUSE_NUMBER_PREFIX.exec(line);
    if (match === null) {
        // No recognisable house number: the whole line is the street.
        return { street: line };
    }
    const [, houseNumber, remainder] = match;
    return { house_number: houseNumber, street: remainder.trim() };
}
|
|
40
|
+
/**
 * Render the single-line address string: `venue, house street, city, state postcode`.
 * Empty or missing parts are dropped together with their separators.
 *
 * @param venue Optional venue name; may be `undefined`.
 * @param house Optional house number.
 * @param street Street text.
 * @param city Locality name; trimmed before use.
 * @param state Region abbreviation.
 * @param postcode Postal code.
 * @returns The comma-joined line, or `""` when every part is empty.
 */
function composeRaw(venue, house, street, city, state, postcode) {
    const present = (part) => Boolean(part);
    const regionAndPostcode = [state, postcode].filter(present).join(" ").trim();
    const localityLine = [city.trim(), regionAndPostcode].filter(present).join(", ");
    const streetLine = [house, street].filter(present).join(" ").trim();
    const segments = [venue, streetLine, localityLine];
    return segments.filter(present).join(", ");
}
|
|
45
|
+
/**
 * Build the `usgov-nppes` corpus adapter.
 *
 * The returned object exposes `id`, `defaultLicense`, `description`, and an async generator
 * `rows(opts)` that streams the CSV at `opts.inputPath` and yields one corpus row per usable
 * record.
 */
export function createUsgovNppesAdapter() {
    return {
        id: USGOV_NPPES_ADAPTER_ID,
        defaultLicense: USGOV_NPPES_DEFAULT_LICENSE,
        description: "CMS National Plan and Provider Enumeration System — 7M provider practice locations (public-domain). Venue+address co-occurrence at scale.",
        /**
         * Stream the NPPES CSV and yield one corpus row per usable record.
         *
         * A record is skipped when it has no city or postal code, when
         * `lookupStateAbbreviation` returns nothing for its state, when both address lines are
         * blank, when the composed raw string is empty, or when reconciliation leaves two or
         * fewer components.
         *
         * @param opts Reads `inputPath` (CSV file), optional `country` (must be `"US"` when set),
         *   optional `limit` (maximum number of rows to yield) and optional `signal` (iteration
         *   stops once it is aborted).
         * @throws Error when `opts.country` is set to anything other than `"US"`; also rethrows
         *   file-read and CSV-parse errors to the consumer of the iterator.
         */
        async *rows(opts) {
            if (opts.country && opts.country !== "US") {
                throw new Error(`usgov-nppes adapter: only US supported, got country=${opts.country}`);
            }
            const stream = createReadStream(opts.inputPath, { encoding: "utf8" });
            const parser = stream.pipe(csvParse({
                columns: true,
                skip_empty_lines: true,
                relax_quotes: true,
                relax_column_count: true,
            }));
            // `pipe()` does not forward source errors to its destination. Without this handler a
            // missing or unreadable input file raises an unhandled 'error' event instead of
            // rejecting the `for await` loop below.
            stream.on("error", (err) => {
                parser.destroy(err);
            });
            let emitted = 0;
            try {
                for await (const record of parser) {
                    if (opts.signal?.aborted)
                        break;
                    if (opts.limit !== undefined && emitted >= opts.limit)
                        break;
                    const npi = (record.NPI ?? "").trim();
                    const orgName = (record["Provider Organization Name (Legal Business Name)"] ?? "").trim();
                    const lastName = (record["Provider Last Name (Legal Name)"] ?? "").trim();
                    const firstName = (record["Provider First Name"] ?? "").trim();
                    const address1 = (record["Provider First Line Business Practice Location Address"] ?? "").trim();
                    const address2 = (record["Provider Second Line Business Practice Location Address"] ?? "").trim();
                    const city = (record["Provider Business Practice Location Address City Name"] ?? "").trim();
                    const stateRaw = (record["Provider Business Practice Location Address State Name"] ?? "").trim();
                    const postcode = (record["Provider Business Practice Location Address Postal Code"] ?? "").trim();
                    if (!city || !postcode)
                        continue;
                    const state = lookupStateAbbreviation(stateRaw);
                    if (!state)
                        continue;
                    // Both practice-location lines are joined with a space before the house
                    // number is split off the front.
                    const fullStreet = [address1, address2].filter(Boolean).join(" ");
                    const split = splitAddress(fullStreet);
                    if (!split)
                        continue;
                    // NOTE(review): individual providers (no organization name) are emitted with
                    // their "first last" name as `venue`. The "Entity Type Code" column is
                    // available if these should be told apart from organizations — confirm intent.
                    const venue = orgName || [firstName, lastName].filter(Boolean).join(" ") || undefined;
                    const components = {
                        ...(venue ? { venue } : {}),
                        ...(split.house_number ? { house_number: split.house_number } : {}),
                        street: split.street,
                        locality: city,
                        region: state.abbreviation,
                        postcode,
                    };
                    const raw = composeRaw(venue, split.house_number, split.street, city, state.abbreviation, postcode);
                    if (!raw)
                        continue;
                    const aligned = reconcileComponents(components, raw);
                    if (Object.keys(aligned).length <= 2)
                        continue;
                    // Prefer the NPI as a stable identifier; otherwise derive one from the components.
                    const sourceId = npi ? `${USGOV_NPPES_ADAPTER_ID}-${npi}` : stableSourceId(USGOV_NPPES_ADAPTER_ID, aligned);
                    yield {
                        raw,
                        components: aligned,
                        country: "US",
                        locale: "en-US",
                        source: USGOV_NPPES_ADAPTER_ID,
                        source_id: sourceId,
                        corpus_version: "",
                        license: USGOV_NPPES_DEFAULT_LICENSE,
                    };
                    emitted++;
                }
            }
            finally {
                // Release the file handle on every exit path: completion, early break, or error.
                stream.destroy();
            }
        },
    };
}
|
|
122
|
+
/** Ready-to-use adapter instance, created once when this module is loaded. */
export const usgovNppesAdapter = createUsgovNppesAdapter();
|
|
123
|
+
//# sourceMappingURL=adapter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../../../src/adapters/usgov-nppes/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAEH,OAAO,EAAE,KAAK,IAAI,QAAQ,EAAE,MAAM,WAAW,CAAA;AAC7C,OAAO,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAA;AAC1C,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAA;AACjD,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAA;AACtE,OAAO,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAA;AAGrD,MAAM,CAAC,MAAM,sBAAsB,GAAG,aAAa,CAAA;AACnD,MAAM,CAAC,MAAM,2BAA2B,GAAG,eAAe,CAAA;AAE1D,MAAM,mBAAmB,GAAG,kCAAkC,CAAA;AAe9D,SAAS,YAAY,CAAC,OAAe;IACpC,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,CAAA;IAC9B,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IACzB,MAAM,CAAC,GAAG,mBAAmB,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IAC3C,IAAI,CAAC;QAAE,OAAO,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,EAAE,CAAA;IAC1D,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAA;AAC3B,CAAC;AAED,SAAS,UAAU,CAClB,KAAyB,EACzB,KAAyB,EACzB,MAAc,EACd,IAAY,EACZ,KAAa,EACb,QAAgB;IAEhB,MAAM,UAAU,GAAG,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACnE,MAAM,QAAQ,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAC7G,OAAO,CAAC,KAAK,EAAE,UAAU,EAAE,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AAChE,CAAC;AAED,MAAM,UAAU,uBAAuB;IACtC,OAAO;QACN,EAAE,EAAE,sBAAsB;QAC1B,cAAc,EAAE,2BAA2B;QAC3C,WAAW,EACV,2IAA2I;QAE5I,KAAK,CAAC,CAAC,IAAI,CAAC,IAAoB;YAC/B,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;gBAC3C,MAAM,IAAI,KAAK,CAAC,uDAAuD,IAAI,CAAC,OAAO,EAAE,CAAC,CAAA;YACvF,CAAC;YAED,MAAM,MAAM,GAAG,gBAAgB,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;YACrE,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CACzB,QAAQ,CAAC;gBACR,OAAO,EAAE,IAAI;gBACb,gBAAgB,EAAE,IAAI;gBACtB,YAAY,EAAE,IAAI;gBAClB,kBAAkB,EAAE,IAAI;aACxB,CAAC,CACF,CAAA;YAED,IAAI,OAAO,GAAG,CAAC,CAAA;YACf,IAAI,CAAC;gBACJ,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,MAAiC,EAAE,CAAC;oBAC9D,I
AAI,IAAI,CAAC,MAAM,EAAE,OAAO;wBAAE,MAAK;oBAC/B,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,OAAO,IAAI,IAAI,CAAC,KAAK;wBAAE,MAAK;oBAE5D,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBACrC,MAAM,UAAU,GAAG,CAAC,MAAM,CAAC,kBAAkB,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC5D,MAAM,OAAO,GAAG,CAAC,MAAM,CAAC,kDAAkD,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBACzF,MAAM,QAAQ,GAAG,CAAC,MAAM,CAAC,iCAAiC,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBACzE,MAAM,SAAS,GAAG,CAAC,MAAM,CAAC,qBAAqB,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAE9D,MAAM,QAAQ,GAAG,CAAC,MAAM,CAAC,wDAAwD,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAChG,MAAM,QAAQ,GAAG,CAAC,MAAM,CAAC,yDAAyD,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBACjG,MAAM,IAAI,GAAG,CAAC,MAAM,CAAC,uDAAuD,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC3F,MAAM,QAAQ,GAAG,CAAC,MAAM,CAAC,wDAAwD,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAChG,MAAM,QAAQ,GAAG,CAAC,MAAM,CAAC,yDAAyD,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAEjG,IAAI,CAAC,IAAI,IAAI,CAAC,QAAQ;wBAAE,SAAQ;oBAEhC,MAAM,KAAK,GAAG,uBAAuB,CAAC,QAAQ,CAAC,CAAA;oBAC/C,IAAI,CAAC,KAAK;wBAAE,SAAQ;oBAEpB,MAAM,UAAU,GAAG,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;oBACjE,MAAM,KAAK,GAAG,YAAY,CAAC,UAAU,CAAC,CAAA;oBACtC,IAAI,CAAC,KAAK;wBAAE,SAAQ;oBAEpB,MAAM,KAAK,GAAG,OAAO,IAAI,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,SAAS,CAAA;oBAErF,MAAM,UAAU,GAA+B;wBAC9C,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;wBAC3B,GAAG,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,KAAK,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;wBACnE,MAAM,EAAE,KAAK,CAAC,MAAM;wBACpB,QAAQ,EAAE,IAAI;wBACd,MAAM,EAAE,KAAK,CAAC,YAAY;wBAC1B,QAAQ;qBACR,CAAA;oBAED,MAAM,GAAG,GAAG,UAAU,CAAC,KAAK,EAAE,KAAK,CAAC,YAAY,EAAE,KAAK,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAA;oBACnG,IAAI,CAAC,GAAG;wBAAE,SAAQ;oBAElB,MAAM,OAAO,GAAG,mBAAmB,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;oBACpD,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,IAAI,CAAC;wBAAE,SAAQ;oBAE9C,MA
AM,QAAQ,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,sBAAsB,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC,cAAc,CAAC,sBAAsB,EAAE,OAAO,CAAC,CAAA;oBAE3G,MAAM;wBACL,GAAG;wBACH,UAAU,EAAE,OAAO;wBACnB,OAAO,EAAE,IAAI;wBACb,MAAM,EAAE,OAAO;wBACf,MAAM,EAAE,sBAAsB;wBAC9B,SAAS,EAAE,QAAQ;wBACnB,cAAc,EAAE,EAAE;wBAClB,OAAO,EAAE,2BAA2B;qBACpC,CAAA;oBACD,OAAO,EAAE,CAAA;gBACV,CAAC;YACF,CAAC;oBAAS,CAAC;gBACV,MAAM,CAAC,OAAO,EAAE,CAAA;YACjB,CAAC;QACF,CAAC;KACD,CAAA;AACF,CAAC;AAED,MAAM,CAAC,MAAM,iBAAiB,GAAG,uBAAuB,EAAE,CAAA"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `usgov-samhsa-treatment-locator`: SAMHSA Behavioral Health Treatment Services Locator CSV
|
|
7
|
+
* consumer.
|
|
8
|
+
*
|
|
9
|
+
* SAMHSA's Treatment Locator (`findtreatment.gov`) is the federal directory of substance-use and
|
|
10
|
+
* mental-health treatment facilities. The published CSV carries the facility name, an optional
|
|
11
|
+
* secondary name (typically the organizational parent), and the postal address quad split into
|
|
12
|
+
* primary + secondary street lines. Phase 1.6 §1.2 (#22) selects this source for the same reason
|
|
13
|
+
* it selects HRSA: facility names are hand-typed venue strings and the addresses pass through
|
|
14
|
+
* enough human + system hands to accumulate the suite-designator + sub-tenant chaos ("Suite C,
|
|
15
|
+
* behind main building") that pure gazetteer data does not.
|
|
16
|
+
*
|
|
17
|
+
* SAMHSA's two-line address shape is the key adapter-specific concern. `street1` typically carries
|
|
18
|
+
* the canonical postal address (`"123 Main St"`); `street2` carries the suite / unit / "second
|
|
19
|
+
* floor" surface form. The adapter joins them with `", "` into a single `street` component (Phase
|
|
20
|
+
* 1 keeps `unit` as a deferred slot since the OpenCage template doesn't have a clean rendering
|
|
21
|
+
* for it). Operators wanting a different join policy can wrap the factory.
|
|
22
|
+
*
|
|
23
|
+
* Column names below match the canonical SAMHSA Behavioral Health Treatment Services Locator CSV
|
|
24
|
+
* export header. Operators substituting a closely-related extract should rename columns to match;
|
|
25
|
+
* the README has the mapping cheatsheet.
|
|
26
|
+
*
|
|
27
|
+
* License: stamped `"Public Domain"` per the SAMHSA Open Data Foundry's federal-government
|
|
28
|
+
* distribution terms.
|
|
29
|
+
*/
|
|
30
|
+
import type { CorpusAdapter } from "../../types.js";
|
|
31
|
+
/** Identifier string of the SAMHSA Treatment Locator adapter. */
export declare const USGOV_SAMHSA_ADAPTER_ID = "usgov-samhsa-treatment-locator";
|
|
32
|
+
/** Default license string for rows from the SAMHSA Treatment Locator adapter. */
export declare const USGOV_SAMHSA_DEFAULT_LICENSE = "Public Domain";
|
|
33
|
+
/**
 * Create a new SAMHSA Treatment Locator adapter.
 *
 * @returns A `CorpusAdapter` for the SAMHSA Behavioral Health Treatment Services Locator CSV.
 */
export declare function createUsgovSamhsaTreatmentLocatorAdapter(): CorpusAdapter;
|
|
34
|
+
/** Pre-built SAMHSA Treatment Locator adapter instance exported by the module. */
export declare const usgovSamhsaTreatmentLocatorAdapter: CorpusAdapter;
|
|
35
|
+
//# sourceMappingURL=adapter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../../../src/adapters/usgov-samhsa-treatment-locator/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AAOH,OAAO,KAAK,EAAgC,aAAa,EAAE,MAAM,gBAAgB,CAAA;AAEjF,eAAO,MAAM,uBAAuB,mCAAmC,CAAA;AACvE,eAAO,MAAM,4BAA4B,kBAAkB,CAAA;AA2E3D,wBAAgB,wCAAwC,IAAI,aAAa,CA6ExE;AAED,eAAO,MAAM,kCAAkC,eAA6C,CAAA"}
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `usgov-samhsa-treatment-locator`: SAMHSA Behavioral Health Treatment Services Locator CSV
|
|
7
|
+
* consumer.
|
|
8
|
+
*
|
|
9
|
+
* SAMHSA's Treatment Locator (`findtreatment.gov`) is the federal directory of substance-use and
|
|
10
|
+
* mental-health treatment facilities. The published CSV carries the facility name, an optional
|
|
11
|
+
* secondary name (typically the organizational parent), and the postal address quad split into
|
|
12
|
+
* primary + secondary street lines. Phase 1.6 §1.2 (#22) selects this source for the same reason
|
|
13
|
+
* it selects HRSA: facility names are hand-typed venue strings and the addresses pass through
|
|
14
|
+
* enough human + system hands to accumulate the suite-designator + sub-tenant chaos ("Suite C,
|
|
15
|
+
* behind main building") that pure gazetteer data does not.
|
|
16
|
+
*
|
|
17
|
+
* SAMHSA's two-line address shape is the key adapter-specific concern. `street1` typically carries
|
|
18
|
+
* the canonical postal address (`"123 Main St"`); `street2` carries the suite / unit / "second
|
|
19
|
+
* floor" surface form. The adapter joins them with `", "` into a single `street` component (Phase
|
|
20
|
+
* 1 keeps `unit` as a deferred slot since the OpenCage template doesn't have a clean rendering
|
|
21
|
+
* for it). Operators wanting a different join policy can wrap the factory.
|
|
22
|
+
*
|
|
23
|
+
* Column names below match the canonical SAMHSA Behavioral Health Treatment Services Locator CSV
|
|
24
|
+
* export header. Operators substituting a closely-related extract should rename columns to match;
|
|
25
|
+
* the README has the mapping cheatsheet.
|
|
26
|
+
*
|
|
27
|
+
* License: stamped `"Public Domain"` per the SAMHSA Open Data Foundry's federal-government
|
|
28
|
+
* distribution terms.
|
|
29
|
+
*/
|
|
30
|
+
import { parse as csvParse } from "csv-parse";
|
|
31
|
+
import { createReadStream } from "node:fs";
|
|
32
|
+
import { stableSourceId } from "../../adapter.js";
|
|
33
|
+
import { lookupStateAbbreviation } from "../../codex/us-fips-state.js";
|
|
34
|
+
import { reconcileComponents } from "../../format.js";
|
|
35
|
+
export const USGOV_SAMHSA_ADAPTER_ID = "usgov-samhsa-treatment-locator";
|
|
36
|
+
export const USGOV_SAMHSA_DEFAULT_LICENSE = "Public Domain";
|
|
37
|
+
/**
 * Leading house number (digits, optional `-digits` range, optional single-letter suffix),
 * then whitespace, then the remainder of the line.
 */
const HOUSE_NUMBER_PREFIX = /^(\d+(?:-\d+)?[A-Za-z]?)\s+(.+)$/;

/**
 * Separate an optional leading house number from the rest of a street line.
 *
 * @param address Raw street line; surrounding whitespace is ignored.
 * @returns `null` for a blank line; `{ house_number, street }` when the line starts with a
 *   house number followed by more text; otherwise `{ street }` holding the whole trimmed line.
 */
function splitAddress(address) {
    const cleaned = address.trim();
    if (cleaned === "") {
        return null;
    }
    const parts = HOUSE_NUMBER_PREFIX.exec(cleaned);
    return parts
        ? { house_number: parts[1], street: parts[2].trim() }
        : { street: cleaned };
}
|
|
47
|
+
/**
 * Merge SAMHSA's two street lines into one `street` surface form.
 *
 * Each line is trimmed; the non-empty ones are joined with `", "`. The secondary line (suite /
 * unit / floor / "behind main building") is not split out into a `unit` component — see the
 * file-level comment.
 *
 * @param street1 Primary street line.
 * @param street2 Optional secondary line; `null`/`undefined` is treated as empty.
 * @returns The joined street text, or `""` when both lines are blank.
 */
function joinTwoLineStreet(street1, street2) {
    const lines = [street1.trim(), (street2 ?? "").trim()];
    return lines.filter((line) => line !== "").join(", ");
}
|
|
63
|
+
/**
 * Build the venue surface form from `name1` and the optional `name2`. SAMHSA conventions:
 *
 * - `name1` is the program / clinic name ("Mountain Plains Counseling Services").
 * - `name2` is the parent organization ("Catholic Charities of Wyoming"), if any.
 *
 * Each name is trimmed; the non-empty ones are joined as `"<name1> - <name2>"`. Geocoder users
 * typically type either form, so the model benefits from the joined surface.
 *
 * @param name1 Primary name.
 * @param name2 Optional secondary name; `null`/`undefined` is treated as empty.
 * @returns The venue string, or `""` when both names are blank.
 */
function composeVenue(name1, name2) {
    const names = [name1.trim(), (name2 ?? "").trim()];
    return names.filter((name) => name !== "").join(" - ");
}
|
|
83
|
+
/**
 * Render the envelope-style single line (same format as HRSA): venue prefix, street body,
 * city/state/zip suffix. Empty or missing parts are dropped together with their separators.
 *
 * @param venue Venue name; trimmed before use.
 * @param house Optional house number.
 * @param street Street text.
 * @param city Locality name; trimmed before use.
 * @param state Region abbreviation.
 * @param postcode Postal code.
 * @returns The comma-joined line, or `""` when every part is empty.
 */
function composeRaw(venue, house, street, city, state, postcode) {
    const present = (part) => Boolean(part);
    const regionAndPostcode = [state, postcode].filter(present).join(" ").trim();
    const localityLine = [city.trim(), regionAndPostcode].filter(present).join(", ");
    const streetLine = [house, street].filter(present).join(" ").trim();
    const segments = [venue.trim(), streetLine, localityLine];
    return segments.filter(present).join(", ");
}
|
|
89
|
+
/**
 * Build the `usgov-samhsa-treatment-locator` corpus adapter.
 *
 * The returned object exposes `id`, `defaultLicense`, `description`, and an async generator
 * `rows(opts)` that streams the CSV at `opts.inputPath` and yields one corpus row per usable
 * record.
 */
export function createUsgovSamhsaTreatmentLocatorAdapter() {
    return {
        id: USGOV_SAMHSA_ADAPTER_ID,
        defaultLicense: USGOV_SAMHSA_DEFAULT_LICENSE,
        description: "SAMHSA Behavioral Health Treatment Services Locator (public-domain). Adversarial source: venue + two-line address co-occurrence, hand-entered.",
        /**
         * Stream the SAMHSA CSV and yield one corpus row per usable record.
         *
         * A record is skipped when it has no venue name, street, city or zip, when
         * `lookupStateAbbreviation` returns nothing for its state, when the composed raw string
         * is empty, or when reconciliation leaves no components.
         *
         * @param opts Reads `inputPath` (CSV file), optional `country` (must be `"US"` when set),
         *   optional `limit` (maximum number of rows to yield) and optional `signal` (iteration
         *   stops once it is aborted).
         * @throws Error when `opts.country` is set to anything other than `"US"`; also rethrows
         *   file-read and CSV-parse errors to the consumer of the iterator.
         */
        async *rows(opts) {
            if (opts.country && opts.country !== "US") {
                throw new Error(`usgov-samhsa adapter: only US supported, got country=${opts.country}`);
            }
            const stream = createReadStream(opts.inputPath, { encoding: "utf8" });
            const parser = stream.pipe(csvParse({
                columns: true,
                skip_empty_lines: true,
                relax_quotes: true,
                relax_column_count: true,
            }));
            // `pipe()` does not forward source errors to its destination. Without this handler a
            // missing or unreadable input file raises an unhandled 'error' event instead of
            // rejecting the `for await` loop below.
            stream.on("error", (err) => {
                parser.destroy(err);
            });
            let emitted = 0;
            try {
                for await (const record of parser) {
                    if (opts.signal?.aborted)
                        break;
                    if (opts.limit !== undefined && emitted >= opts.limit)
                        break;
                    const venue = composeVenue(record.name1 ?? "", record.name2);
                    // The two street lines are joined first, then the house number is split
                    // off the front of the combined line.
                    const street = joinTwoLineStreet(record.street1 ?? "", record.street2);
                    const split = splitAddress(street);
                    const city = (record.city ?? "").trim();
                    const stateAbbr = (record.state ?? "").trim();
                    const postcode = (record.zip ?? "").trim();
                    if (!venue || !split || !city || !postcode)
                        continue;
                    const state = lookupStateAbbreviation(stateAbbr);
                    if (!state)
                        continue;
                    // venue first — same insertion order as the HRSA adapter (per the original note).
                    const components = {
                        venue,
                        ...(split.house_number ? { house_number: split.house_number } : {}),
                        street: split.street,
                        locality: city,
                        region: state.abbreviation,
                        postcode,
                    };
                    const raw = composeRaw(venue, split.house_number, split.street, city, state.abbreviation, postcode);
                    if (!raw)
                        continue;
                    const aligned = reconcileComponents(components, raw);
                    if (Object.keys(aligned).length === 0)
                        continue;
                    // Prefer the record's `frid` as a stable identifier; otherwise derive one
                    // from the components.
                    const frId = (record.frid ?? "").trim();
                    const sourceId = frId
                        ? `${USGOV_SAMHSA_ADAPTER_ID}-${frId}`
                        : stableSourceId(USGOV_SAMHSA_ADAPTER_ID, aligned);
                    yield {
                        raw,
                        components: aligned,
                        country: "US",
                        locale: "en-US",
                        source: USGOV_SAMHSA_ADAPTER_ID,
                        source_id: sourceId,
                        corpus_version: "",
                        license: USGOV_SAMHSA_DEFAULT_LICENSE,
                    };
                    emitted++;
                }
            }
            finally {
                // Release the file handle on every exit path: completion, early break, or error.
                stream.destroy();
            }
        },
    };
}
|
|
161
|
+
/** Ready-to-use adapter instance, created once when this module is loaded. */
export const usgovSamhsaTreatmentLocatorAdapter = createUsgovSamhsaTreatmentLocatorAdapter();
|
|
162
|
+
//# sourceMappingURL=adapter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../../../src/adapters/usgov-samhsa-treatment-locator/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AAEH,OAAO,EAAE,KAAK,IAAI,QAAQ,EAAE,MAAM,WAAW,CAAA;AAC7C,OAAO,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAA;AAC1C,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAA;AACjD,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAA;AACtE,OAAO,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAA;AAGrD,MAAM,CAAC,MAAM,uBAAuB,GAAG,gCAAgC,CAAA;AACvE,MAAM,CAAC,MAAM,4BAA4B,GAAG,eAAe,CAAA;AAmB3D,MAAM,mBAAmB,GAAG,kCAAkC,CAAA;AAE9D,SAAS,YAAY,CAAC,OAAe;IACpC,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,CAAA;IAC9B,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IACzB,MAAM,CAAC,GAAG,mBAAmB,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IAC3C,IAAI,CAAC;QAAE,OAAO,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,EAAE,CAAA;IAC1D,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAA;AAC3B,CAAC;AAED;;;;GAIG;AACH,SAAS,iBAAiB,CAAC,OAAe,EAAE,OAA2B;IACtE,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,EAAE,CAAA;IACzB,MAAM,EAAE,GAAG,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;IACjC,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE;QAAE,OAAO,EAAE,CAAA;IACzB,IAAI,CAAC,EAAE;QAAE,OAAO,EAAE,CAAA;IAClB,IAAI,CAAC,EAAE;QAAE,OAAO,EAAE,CAAA;IAClB,OAAO,GAAG,EAAE,KAAK,EAAE,EAAE,CAAA;AACtB,CAAC;AAED;;;;;;;;GAQG;AACH,SAAS,YAAY,CAAC,KAAa,EAAE,KAAyB;IAC7D,MAAM,EAAE,GAAG,KAAK,CAAC,IAAI,EAAE,CAAA;IACvB,MAAM,EAAE,GAAG,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;IAC/B,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE;QAAE,OAAO,EAAE,CAAA;IACzB,IAAI,CAAC,EAAE;QAAE,OAAO,EAAE,CAAA;IAClB,IAAI,CAAC,EAAE;QAAE,OAAO,EAAE,CAAA;IAClB,OAAO,GAAG,EAAE,MAAM,EAAE,EAAE,CAAA;AACvB,CAAC;AAED,4FAA4F;AAC5F,SAAS,UAAU,CAClB,KAAa,EACb,KAAyB,EACzB,MAAc,EACd,IAAY,EACZ,KAAa,EACb,QAAgB;IAEhB,MAAM,UAAU,GAAG,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACnE,MAAM,QAAQ,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAA
A;IAC7G,OAAO,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,UAAU,EAAE,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AACvE,CAAC;AAED,MAAM,UAAU,wCAAwC;IACvD,OAAO;QACN,EAAE,EAAE,uBAAuB;QAC3B,cAAc,EAAE,4BAA4B;QAC5C,WAAW,EACV,gJAAgJ;QAEjJ,KAAK,CAAC,CAAC,IAAI,CAAC,IAAoB;YAC/B,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;gBAC3C,MAAM,IAAI,KAAK,CAAC,wDAAwD,IAAI,CAAC,OAAO,EAAE,CAAC,CAAA;YACxF,CAAC;YAED,MAAM,MAAM,GAAG,gBAAgB,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;YACrE,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CACzB,QAAQ,CAAC;gBACR,OAAO,EAAE,IAAI;gBACb,gBAAgB,EAAE,IAAI;gBACtB,YAAY,EAAE,IAAI;gBAClB,kBAAkB,EAAE,IAAI;aACxB,CAAC,CACF,CAAA;YAED,IAAI,OAAO,GAAG,CAAC,CAAA;YACf,IAAI,CAAC;gBACJ,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,MAAsC,EAAE,CAAC;oBACnE,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;wBAAE,MAAK;oBAC/B,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,OAAO,IAAI,IAAI,CAAC,KAAK;wBAAE,MAAK;oBAE5D,MAAM,KAAK,GAAG,YAAY,CAAC,MAAM,CAAC,KAAK,IAAI,EAAE,EAAE,MAAM,CAAC,KAAK,CAAC,CAAA;oBAC5D,MAAM,MAAM,GAAG,iBAAiB,CAAC,MAAM,CAAC,OAAO,IAAI,EAAE,EAAE,MAAM,CAAC,OAAO,CAAC,CAAA;oBACtE,MAAM,KAAK,GAAG,YAAY,CAAC,MAAM,CAAC,CAAA;oBAClC,MAAM,IAAI,GAAG,CAAC,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBACvC,MAAM,SAAS,GAAG,CAAC,MAAM,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC7C,MAAM,QAAQ,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAE1C,IAAI,CAAC,KAAK,IAAI,CAAC,KAAK,IAAI,CAAC,IAAI,IAAI,CAAC,QAAQ;wBAAE,SAAQ;oBACpD,MAAM,KAAK,GAAG,uBAAuB,CAAC,SAAS,CAAC,CAAA;oBAChD,IAAI,CAAC,KAAK;wBAAE,SAAQ;oBAEpB,mEAAmE;oBACnE,MAAM,UAAU,GAA+B;wBAC9C,KAAK;wBACL,GAAG,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,KAAK,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;wBACnE,MAAM,EAAE,KAAK,CAAC,MAAM;wBACpB,QAAQ,EAAE,IAAI;wBACd,MAAM,EAAE,KAAK,CAAC,YAAY;wBAC1B,QAAQ;qBACR,CAAA;oBAED,MAAM,GAAG,GAAG,UAAU,CAAC,KAAK,EAAE,KAAK,CAAC,YAAY,EAAE,KAAK,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAA;oBACnG,IAAI,CAAC,GAAG;wBAAE,SAAQ;oBAElB,MAAM,OAAO,GAAG,mBAAmB,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;oBACpD,IAAI,MAAM,CAAC,IA
AI,CAAC,OAAO,CAAC,CAAC,MAAM,KAAK,CAAC;wBAAE,SAAQ;oBAE/C,MAAM,IAAI,GAAG,CAAC,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBACvC,MAAM,QAAQ,GAAG,IAAI;wBACpB,CAAC,CAAC,GAAG,uBAAuB,IAAI,IAAI,EAAE;wBACtC,CAAC,CAAC,cAAc,CAAC,uBAAuB,EAAE,OAAO,CAAC,CAAA;oBAEnD,MAAM;wBACL,GAAG;wBACH,UAAU,EAAE,OAAO;wBACnB,OAAO,EAAE,IAAI;wBACb,MAAM,EAAE,OAAO;wBACf,MAAM,EAAE,uBAAuB;wBAC/B,SAAS,EAAE,QAAQ;wBACnB,cAAc,EAAE,EAAE;wBAClB,OAAO,EAAE,4BAA4B;qBACrC,CAAA;oBACD,OAAO,EAAE,CAAA;gBACV,CAAC;YACF,CAAC;oBAAS,CAAC;gBACV,MAAM,CAAC,OAAO,EAAE,CAAA;YACjB,CAAC;QACF,CAAC;KACD,CAAA;AACF,CAAC;AAED,MAAM,CAAC,MAAM,kCAAkC,GAAG,wCAAwC,EAAE,CAAA"}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `wof-admin`: Who's On First admin GeoJSON-bundle adapter.
|
|
7
|
+
*
|
|
8
|
+
* **Phase 1.5.1 pivot.** The original Phase 1.5 SQLite adapter (formerly at
|
|
9
|
+
* `packages/corpus/src/adapters/wof-admin/`, removed in this same change) was replaced by this
|
|
10
|
+
* one because the SQLite distribution path was unworkable for the real corpus build:
|
|
11
|
+
*
|
|
12
|
+
* 1. `dist.whosonfirst.org/sqlite/` is dead (NXDOMAIN); the Geocode-Earth mirror is the only remaining source.
|
|
13
|
+
* 2. The Geocode-Earth-hosted postalcode DB tags every row `mz:is_current = -1` ("unknown but treated
|
|
14
|
+
* as active"); the SQLite adapter's `is_current = 1` predicate emitted zero rows.
|
|
15
|
+
* 3. The `names` table in the SQLite distribution is empty — localized `name:*` variants live in a
|
|
16
|
+
* separate distribution. The St. Petersburg / Mt. Vernon / Ft. Lauderdale alternation cases
|
|
17
|
+
* (the original Phase 1.5.1 motivator) cannot be solved on the SQLite path even with a
|
|
18
|
+
* patched `is_current` predicate.
|
|
19
|
+
*
|
|
20
|
+
* Input: a directory containing one or more cloned `whosonfirst-data-admin-<cc>` GitHub repos. Each
|
|
21
|
+
* repo has `data/XXX/YYY/ZZZ/<wof-id>.geojson` files; `**\/*.geojson` walks the tree recursively.
|
|
22
|
+
* Alternate-geometry siblings (`-alt-*`) are skipped — they're separate exports of the same
|
|
23
|
+
* record, not new records.
|
|
24
|
+
*
|
|
25
|
+
* Per record, the adapter emits one row per `(name-variant, hierarchy-variant)` pair:
|
|
26
|
+
*
|
|
27
|
+
* - **Name variants**: the canonical `wof:name` (slot key `default`) plus every `name:*` localized
|
|
28
|
+
* variant present on the feature (`name:eng_x_preferred`, `name:eng_x_colloquial`,
|
|
29
|
+
* `name:rus_x_preferred`, ...). This is the Phase 1.5.1 fix for the St. Petersburg case:
|
|
30
|
+
* `"Saint Petersburg"` (canonical) and `"St. Petersburg"` (eng_x_colloquial) both become
|
|
31
|
+
* training rows for the same WOF id.
|
|
32
|
+
* - **Hierarchy variants** (unchanged from the SQLite adapter): locality → 3 variants, region → 2,
|
|
33
|
+
* country → 1, county → 1.
|
|
34
|
+
*
|
|
35
|
+
* `source_id` is `wof-admin-<wof_id>-<name-slot>-<hierarchy-variant>`. The previous SQLite adapter
|
|
36
|
+
* used `wof-admin-<wof_id>-<hierarchy-variant>` (no name slot); the new format adds a name-slot
|
|
37
|
+
* segment so the colloquial / preferred / per-locale variants survive dedup independently.
|
|
38
|
+
*
|
|
39
|
+
* License: CC0. The adapter stamps every row with `CC0-1.0`.
|
|
40
|
+
*/
|
|
41
|
+
import type { ComponentTag } from "@mailwoman/core/types";
|
|
42
|
+
import type { CorpusAdapter } from "../../types.js";
|
|
43
|
+
import { type WofRecord } from "../../wof-json.js";
|
|
44
|
+
interface VariantSpec {
|
|
45
|
+
/** Hierarchy-variant id appended to `source_id`. */
|
|
46
|
+
suffix: string;
|
|
47
|
+
/** Component tag → display string the adapter will hand to the runner. */
|
|
48
|
+
components: Partial<Record<ComponentTag, string>>;
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Compute the hierarchy variants for a record given its ancestry chain and the chosen `selfName`.
|
|
52
|
+
*
|
|
53
|
+
* `selfName` is the surface form to use for the record's own component (locality / region / country
|
|
54
|
+
* / subregion). Callers pass the canonical `wof:name` for the `"default"` slot and a `name:*`
|
|
55
|
+
* localized value for variant slots; ancestor names always come from the ancestor's canonical
|
|
56
|
+
* `wof:name`.
|
|
57
|
+
*
|
|
58
|
+
* Country variants substitute `COUNTRY_DISPLAY_NAME` for the default slot so the OpenCage template
|
|
59
|
+
* produces the canonicalized form (`"United States of America"`), matching the legacy SQLite
|
|
60
|
+
* adapter's behavior.
|
|
61
|
+
*/
|
|
62
|
+
export declare function variantsFor(row: WofRecord, ancestry: WofRecord[], selfName: string): VariantSpec[];
|
|
63
|
+
/**
|
|
64
|
+
* Build the per-record name-slot list. The canonical `"default"` slot uses the OpenCage-canonical
|
|
65
|
+
* country form when the record is itself a country (matches SQLite-adapter behavior); every other
|
|
66
|
+
* placetype's default slot uses `wof:name` verbatim.
|
|
67
|
+
*
|
|
68
|
+
* Subsequent slots come from `name:*` variants, deduplicated against the default name so we don't
|
|
69
|
+
* emit a redundant `"default"`-equivalent row under a localized key.
|
|
70
|
+
*/
|
|
71
|
+
export declare function nameSlotsFor(rec: WofRecord): Array<{
|
|
72
|
+
key: string;
|
|
73
|
+
value: string;
|
|
74
|
+
}>;
|
|
75
|
+
export declare const WOF_ADMIN_ADAPTER_ID = "wof-admin";
|
|
76
|
+
/**
|
|
77
|
+
* Construct the wof-admin JSON-bundle adapter. The adapter is stateless across runs; running it
|
|
78
|
+
* twice with the same input directory produces byte-identical `canonical.jsonl` (records are
|
|
79
|
+
* emitted in sorted `wof:id` order to be insensitive to filesystem walk ordering).
|
|
80
|
+
*/
|
|
81
|
+
export declare function createWofAdminAdapter(): CorpusAdapter;
|
|
82
|
+
/** Single shared instance, suitable for `defaultAdapterRegistry`. */
|
|
83
|
+
export declare const wofAdminAdapter: CorpusAdapter;
|
|
84
|
+
export {};
|
|
85
|
+
//# sourceMappingURL=adapter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../../../src/adapters/wof-admin-json/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAuCG;AAGH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAA;AAEzD,OAAO,KAAK,EAAgC,aAAa,EAAE,MAAM,gBAAgB,CAAA;AACjF,OAAO,EAAsD,KAAK,SAAS,EAAE,MAAM,mBAAmB,CAAA;AAiDtG,UAAU,WAAW;IACpB,oDAAoD;IACpD,MAAM,EAAE,MAAM,CAAA;IAEd,0EAA0E;IAC1E,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAA;CACjD;AAED;;;;;;;;;;;GAWG;AACH,wBAAgB,WAAW,CAAC,GAAG,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,EAAE,QAAQ,EAAE,MAAM,GAAG,WAAW,EAAE,CA0DlG;AAED;;;;;;;GAOG;AACH,wBAAgB,YAAY,CAAC,GAAG,EAAE,SAAS,GAAG,KAAK,CAAC;IAAE,GAAG,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,CAclF;AAED,eAAO,MAAM,oBAAoB,cAAc,CAAA;AAE/C;;;;GAIG;AACH,wBAAgB,qBAAqB,IAAI,aAAa,CA2DrD;AAED,qEAAqE;AACrE,eAAO,MAAM,eAAe,eAA0B,CAAA"}
|