@mailwoman/corpus 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. package/out/src/adapter.d.ts +96 -0
  2. package/out/src/adapter.d.ts.map +1 -0
  3. package/out/src/adapter.js +107 -0
  4. package/out/src/adapter.js.map +1 -0
  5. package/out/src/adapters/ban/adapter.d.ts +32 -0
  6. package/out/src/adapters/ban/adapter.d.ts.map +1 -0
  7. package/out/src/adapters/ban/adapter.js +133 -0
  8. package/out/src/adapters/ban/adapter.js.map +1 -0
  9. package/out/src/adapters/fcc-bdc/adapter.d.ts +61 -0
  10. package/out/src/adapters/fcc-bdc/adapter.d.ts.map +1 -0
  11. package/out/src/adapters/fcc-bdc/adapter.js +153 -0
  12. package/out/src/adapters/fcc-bdc/adapter.js.map +1 -0
  13. package/out/src/adapters/index.d.ts +42 -0
  14. package/out/src/adapters/index.d.ts.map +1 -0
  15. package/out/src/adapters/index.js +76 -0
  16. package/out/src/adapters/index.js.map +1 -0
  17. package/out/src/adapters/openaddresses/adapter.d.ts +60 -0
  18. package/out/src/adapters/openaddresses/adapter.d.ts.map +1 -0
  19. package/out/src/adapters/openaddresses/adapter.js +174 -0
  20. package/out/src/adapters/openaddresses/adapter.js.map +1 -0
  21. package/out/src/adapters/state-ia-contractors/adapter.d.ts +23 -0
  22. package/out/src/adapters/state-ia-contractors/adapter.d.ts.map +1 -0
  23. package/out/src/adapters/state-ia-contractors/adapter.js +113 -0
  24. package/out/src/adapters/state-ia-contractors/adapter.js.map +1 -0
  25. package/out/src/adapters/state-ny-notaries/adapter.d.ts +21 -0
  26. package/out/src/adapters/state-ny-notaries/adapter.d.ts.map +1 -0
  27. package/out/src/adapters/state-ny-notaries/adapter.js +132 -0
  28. package/out/src/adapters/state-ny-notaries/adapter.js.map +1 -0
  29. package/out/src/adapters/state-tx-notaries/adapter.d.ts +22 -0
  30. package/out/src/adapters/state-tx-notaries/adapter.d.ts.map +1 -0
  31. package/out/src/adapters/state-tx-notaries/adapter.js +125 -0
  32. package/out/src/adapters/state-tx-notaries/adapter.js.map +1 -0
  33. package/out/src/adapters/tiger/adapter.d.ts +45 -0
  34. package/out/src/adapters/tiger/adapter.d.ts.map +1 -0
  35. package/out/src/adapters/tiger/adapter.js +179 -0
  36. package/out/src/adapters/tiger/adapter.js.map +1 -0
  37. package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts +36 -0
  38. package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts.map +1 -0
  39. package/out/src/adapters/usgov-hrsa-fqhc/adapter.js +147 -0
  40. package/out/src/adapters/usgov-hrsa-fqhc/adapter.js.map +1 -0
  41. package/out/src/adapters/usgov-imls-pls/adapter.d.ts +25 -0
  42. package/out/src/adapters/usgov-imls-pls/adapter.d.ts.map +1 -0
  43. package/out/src/adapters/usgov-imls-pls/adapter.js +118 -0
  44. package/out/src/adapters/usgov-imls-pls/adapter.js.map +1 -0
  45. package/out/src/adapters/usgov-nad/adapter.d.ts +37 -0
  46. package/out/src/adapters/usgov-nad/adapter.d.ts.map +1 -0
  47. package/out/src/adapters/usgov-nad/adapter.js +227 -0
  48. package/out/src/adapters/usgov-nad/adapter.js.map +1 -0
  49. package/out/src/adapters/usgov-nppes/adapter.d.ts +28 -0
  50. package/out/src/adapters/usgov-nppes/adapter.d.ts.map +1 -0
  51. package/out/src/adapters/usgov-nppes/adapter.js +123 -0
  52. package/out/src/adapters/usgov-nppes/adapter.js.map +1 -0
  53. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts +35 -0
  54. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts.map +1 -0
  55. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js +162 -0
  56. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js.map +1 -0
  57. package/out/src/adapters/wof-admin-json/adapter.d.ts +85 -0
  58. package/out/src/adapters/wof-admin-json/adapter.d.ts.map +1 -0
  59. package/out/src/adapters/wof-admin-json/adapter.js +241 -0
  60. package/out/src/adapters/wof-admin-json/adapter.js.map +1 -0
  61. package/out/src/adapters/wof-postalcode-json/adapter.d.ts +63 -0
  62. package/out/src/adapters/wof-postalcode-json/adapter.d.ts.map +1 -0
  63. package/out/src/adapters/wof-postalcode-json/adapter.js +178 -0
  64. package/out/src/adapters/wof-postalcode-json/adapter.js.map +1 -0
  65. package/out/src/align.d.ts +58 -0
  66. package/out/src/align.d.ts.map +1 -0
  67. package/out/src/align.js +139 -0
  68. package/out/src/align.js.map +1 -0
  69. package/out/src/build.d.ts +104 -0
  70. package/out/src/build.d.ts.map +1 -0
  71. package/out/src/build.js +201 -0
  72. package/out/src/build.js.map +1 -0
  73. package/out/src/codex/us-fips-state.d.ts +44 -0
  74. package/out/src/codex/us-fips-state.d.ts.map +1 -0
  75. package/out/src/codex/us-fips-state.js +105 -0
  76. package/out/src/codex/us-fips-state.js.map +1 -0
  77. package/out/src/codex/us-street-suffix.d.ts +259 -0
  78. package/out/src/codex/us-street-suffix.d.ts.map +1 -0
  79. package/out/src/codex/us-street-suffix.js +285 -0
  80. package/out/src/codex/us-street-suffix.js.map +1 -0
  81. package/out/src/format.d.ts +79 -0
  82. package/out/src/format.d.ts.map +1 -0
  83. package/out/src/format.js +151 -0
  84. package/out/src/format.js.map +1 -0
  85. package/out/src/golden.d.ts +50 -0
  86. package/out/src/golden.d.ts.map +1 -0
  87. package/out/src/golden.js +104 -0
  88. package/out/src/golden.js.map +1 -0
  89. package/out/src/index.d.ts +18 -0
  90. package/out/src/index.d.ts.map +1 -0
  91. package/out/src/index.js +18 -0
  92. package/out/src/index.js.map +1 -0
  93. package/out/src/parquet-wrapper/index.d.ts +12 -0
  94. package/out/src/parquet-wrapper/index.d.ts.map +1 -0
  95. package/out/src/parquet-wrapper/index.js +12 -0
  96. package/out/src/parquet-wrapper/index.js.map +1 -0
  97. package/out/src/parquet-wrapper/reader.d.ts +31 -0
  98. package/out/src/parquet-wrapper/reader.d.ts.map +1 -0
  99. package/out/src/parquet-wrapper/reader.js +54 -0
  100. package/out/src/parquet-wrapper/reader.js.map +1 -0
  101. package/out/src/parquet-wrapper/schema.d.ts +45 -0
  102. package/out/src/parquet-wrapper/schema.d.ts.map +1 -0
  103. package/out/src/parquet-wrapper/schema.js +55 -0
  104. package/out/src/parquet-wrapper/schema.js.map +1 -0
  105. package/out/src/parquet-wrapper/writer.d.ts +41 -0
  106. package/out/src/parquet-wrapper/writer.d.ts.map +1 -0
  107. package/out/src/parquet-wrapper/writer.js +71 -0
  108. package/out/src/parquet-wrapper/writer.js.map +1 -0
  109. package/out/src/parquet.d.ts +122 -0
  110. package/out/src/parquet.d.ts.map +1 -0
  111. package/out/src/parquet.js +220 -0
  112. package/out/src/parquet.js.map +1 -0
  113. package/out/src/runner.d.ts +100 -0
  114. package/out/src/runner.d.ts.map +1 -0
  115. package/out/src/runner.js +183 -0
  116. package/out/src/runner.js.map +1 -0
  117. package/out/src/split.d.ts +108 -0
  118. package/out/src/split.d.ts.map +1 -0
  119. package/out/src/split.js +191 -0
  120. package/out/src/split.js.map +1 -0
  121. package/out/src/synthesize.d.ts +146 -0
  122. package/out/src/synthesize.d.ts.map +1 -0
  123. package/out/src/synthesize.js +472 -0
  124. package/out/src/synthesize.js.map +1 -0
  125. package/out/src/tokenize.d.ts +47 -0
  126. package/out/src/tokenize.d.ts.map +1 -0
  127. package/out/src/tokenize.js +49 -0
  128. package/out/src/tokenize.js.map +1 -0
  129. package/out/src/types.d.ts +168 -0
  130. package/out/src/types.d.ts.map +1 -0
  131. package/out/src/types.js +19 -0
  132. package/out/src/types.js.map +1 -0
  133. package/out/src/wof-json.d.ts +105 -0
  134. package/out/src/wof-json.d.ts.map +1 -0
  135. package/out/src/wof-json.js +174 -0
  136. package/out/src/wof-json.js.map +1 -0
  137. package/package.json +36 -0
@@ -0,0 +1,147 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `usgov-hrsa-fqhc`: HRSA "Health Center Service Delivery Site Locations" CSV consumer.
7
+ *
8
+ * Federally Qualified Health Centers (FQHCs) are HRSA-funded community health programs that
9
+ * self-report site addresses to the HRSA Data Warehouse. The published CSV (`data.hrsa.gov`)
10
+ * carries the site name, the postal-formatted street address, and the locality/region/postcode
11
+ * quad. Phase 1.6 §1.2 (#22) selects this source for its adversarial-value-per-row: every
12
+ * facility name is a human-typed venue string and the addresses pass through enough hands to
13
+ * accumulate the abbreviation drift + suite designator chaos that pure gazetteer data does not.
14
+ *
15
+ * The adapter consumes a CSV file the operator pre-downloads. The HRSA data is published as a
16
+ * single national CSV (~10K rows), small enough that the operator can re-fetch on every corpus
17
+ * rebuild without an intermediate SQLite step. Column names below match the HRSA Data Warehouse's
18
+ * "Health Center Service Delivery Site" public dataset. Operators substituting the
19
+ * closely-related "Site Address" or "Health Center" public extracts may need to remap columns;
20
+ * the README documents the expected set.
21
+ *
22
+ * Output: one row per CSV record, with `venue` component carrying the site name and the address
23
+ * quad on `(house_number, street, locality, region, postcode)`. Component order is load-bearing:
24
+ * `venue` is inserted FIRST so alignment claims its surface span before `locality` searches for
25
+ * its own (the kryptonite case "Buffalo Health Clinic, …, Buffalo, NY" relies on `venue`
26
+ * consuming the first "Buffalo" so locality lands on the second).
27
+ *
28
+ * License: stamped `"Public Domain"` per the HRSA Data Warehouse's federal government distribution
29
+ * terms.
30
+ */
31
+ import { parse as csvParse } from "csv-parse";
32
+ import { createReadStream } from "node:fs";
33
+ import { stableSourceId } from "../../adapter.js";
34
+ import { lookupStateAbbreviation } from "../../codex/us-fips-state.js";
35
+ import { reconcileComponents } from "../../format.js";
36
+ export const USGOV_HRSA_FQHC_ADAPTER_ID = "usgov-hrsa-fqhc";
37
+ export const USGOV_HRSA_FQHC_DEFAULT_LICENSE = "Public Domain";
38
/**
 * Pattern for a leading house number followed by the rest of the address line.
 *
 * Group 1 is the number: digits, optionally a hyphenated second run of digits (`"40-12"`), and
 * optionally one trailing letter (`"123A"`). Group 2 is everything after the separating whitespace.
 */
const HOUSE_NUMBER_PREFIX = /^(\d+(?:-\d+)?[A-Za-z]?)\s+(.+)$/;
/**
 * Break a single-line street address such as "123 Main St Suite 4" into `(house_number, street)`.
 *
 * Suite / Apt / Unit designators are deliberately left on `street` for Phase 1: Mailwoman has a
 * `unit` component, but the address-formatter has no clean slot for it and HRSA does not publish
 * the suite in its own column. Keeping the surface form intact preserves the adversarial training
 * signal (a trailing "Suite 4" is part of the road line in this distribution).
 *
 * @param address Raw address cell; surrounding whitespace is ignored.
 * @returns `null` when the cell is blank, `{ house_number, street }` when the line starts with a
 *   recognised house number, otherwise `{ street }` carrying the whole trimmed line.
 */
function splitAddress(address) {
    const line = address.trim();
    if (line.length === 0) {
        return null;
    }
    const match = line.match(HOUSE_NUMBER_PREFIX);
    if (match === null) {
        // No recognisable number prefix: the whole line is treated as the street.
        return { street: line };
    }
    const [, houseNumber, remainder] = match;
    return { house_number: houseNumber, street: remainder.trim() };
}
59
/**
 * Build the envelope-style raw address line:
 *
 *     "<Site Name>, <house> <street>, <city>, <state> <postcode>"
 *
 * The site name comes first (US addressee-then-address ordering), so a downstream model sees the
 * venue-prefix-then-address shape that HRSA users actually type into geocoders. Empty pieces are
 * dropped rather than leaving dangling separators.
 *
 * @param venue Site name; trimmed before use.
 * @param house House number, or a falsy value when the address had none.
 * @param street Street portion of the address line.
 * @param city Locality name; trimmed before use.
 * @param state State abbreviation.
 * @param postcode Postal code.
 * @returns The composed line, or an empty string when every piece is empty.
 */
function composeRaw(venue, house, street, city, state, postcode) {
    const roadLine = [house, street].filter(Boolean).join(" ").trim();
    const regionLine = [state, postcode].filter(Boolean).join(" ").trim();
    const placeLine = [city.trim(), regionLine].filter(Boolean).join(", ");
    // Append the non-empty segments in order, comma-separated.
    let raw = "";
    for (const segment of [venue.trim(), roadLine, placeLine]) {
        if (!segment) {
            continue;
        }
        raw = raw ? `${raw}, ${segment}` : segment;
    }
    return raw;
}
72
/**
 * Create the `usgov-hrsa-fqhc` corpus adapter.
 *
 * The returned object carries the adapter id, default license, a description, and an async
 * generator `rows(opts)` that streams the CSV at `opts.inputPath` and yields one row per usable
 * record.
 *
 * A record is skipped when its site name, address, city or postcode is blank, when
 * `lookupStateAbbreviation` does not recognise the state cell, or when `reconcileComponents`
 * returns no components.
 *
 * @returns The adapter object.
 */
export function createUsgovHrsaFqhcAdapter() {
    return {
        id: USGOV_HRSA_FQHC_ADAPTER_ID,
        defaultLicense: USGOV_HRSA_FQHC_DEFAULT_LICENSE,
        description: "HRSA Federally Qualified Health Center site locations (public-domain). Adversarial source: venue + address co-occurrence, hand-entered.",
        /**
         * Stream rows from the HRSA CSV.
         *
         * Reads `opts.country`, `opts.inputPath`, `opts.signal` and `opts.limit`. Iteration stops
         * early once `opts.signal` is aborted or `opts.limit` rows have been yielded.
         *
         * @throws Error when `opts.country` is set to anything other than `"US"`. File-read and
         *   CSV-parse failures reject the iteration as well.
         */
        async *rows(opts) {
            if (opts.country && opts.country !== "US") {
                throw new Error(`usgov-hrsa-fqhc adapter: only US supported, got country=${opts.country}`);
            }
            const stream = createReadStream(opts.inputPath, { encoding: "utf8" });
            const parser = csvParse({
                columns: true,
                // Strip a leading UTF-8 byte order mark, if present, so it cannot end up glued to
                // the first header name.
                bom: true,
                skip_empty_lines: true,
                relax_quotes: true,
                relax_column_count: true,
            });
            // `pipe()` does not forward errors from the source stream. Without this listener a
            // failed read (e.g. a missing input file) would surface as an unhandled 'error' event
            // instead of rejecting the `for await` loop below.
            stream.on("error", (error) => {
                parser.destroy(error);
            });
            stream.pipe(parser);
            let emitted = 0;
            try {
                for await (const record of parser) {
                    if (opts.signal?.aborted)
                        break;
                    if (opts.limit !== undefined && emitted >= opts.limit)
                        break;
                    const venue = (record["Site Name"] ?? "").trim();
                    const split = splitAddress(record["Site Address"] ?? "");
                    const city = (record["Site City"] ?? "").trim();
                    const stateAbbr = (record["Site State Abbreviation"] ?? "").trim();
                    const postcode = (record["Site Postal Code"] ?? "").trim();
                    if (!venue || !split || !city || !postcode)
                        continue;
                    const state = lookupStateAbbreviation(stateAbbr);
                    if (!state)
                        continue;
                    // Insertion order matters here. `venue` first so alignment claims its span
                    // (which may contain a token like "Buffalo") before `locality` runs its
                    // search — the kryptonite case `Buffalo Health Clinic, Buffalo NY`
                    // otherwise mis-labels the venue's "Buffalo" as locality.
                    const components = {
                        venue,
                        ...(split.house_number ? { house_number: split.house_number } : {}),
                        street: split.street,
                        locality: city,
                        region: state.abbreviation,
                        postcode,
                    };
                    const raw = composeRaw(venue, split.house_number, split.street, city, state.abbreviation, postcode);
                    if (!raw)
                        continue;
                    const aligned = reconcileComponents(components, raw);
                    if (Object.keys(aligned).length === 0)
                        continue;
                    // Prefer the publisher's own site id; fall back to an id derived from the
                    // aligned components when the cell is blank.
                    const siteId = (record["Site ID"] ?? "").trim();
                    const sourceId = siteId
                        ? `${USGOV_HRSA_FQHC_ADAPTER_ID}-${siteId}`
                        : stableSourceId(USGOV_HRSA_FQHC_ADAPTER_ID, aligned);
                    yield {
                        raw,
                        components: aligned,
                        country: "US",
                        locale: "en-US",
                        source: USGOV_HRSA_FQHC_ADAPTER_ID,
                        source_id: sourceId,
                        // NOTE(review): left empty here — presumably stamped later by the caller; confirm.
                        corpus_version: "",
                        license: USGOV_HRSA_FQHC_DEFAULT_LICENSE,
                    };
                    emitted++;
                }
            }
            finally {
                // Release both ends of the pipe on completion, early exit, or failure.
                stream.destroy();
                parser.destroy();
            }
        },
    };
}
/** Shared default instance of the `usgov-hrsa-fqhc` adapter. */
export const usgovHrsaFqhcAdapter = createUsgovHrsaFqhcAdapter();
147
+ //# sourceMappingURL=adapter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../../../src/adapters/usgov-hrsa-fqhc/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAEH,OAAO,EAAE,KAAK,IAAI,QAAQ,EAAE,MAAM,WAAW,CAAA;AAC7C,OAAO,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAA;AAC1C,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAA;AACjD,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAA;AACtE,OAAO,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAA;AAGrD,MAAM,CAAC,MAAM,0BAA0B,GAAG,iBAAiB,CAAA;AAC3D,MAAM,CAAC,MAAM,+BAA+B,GAAG,eAAe,CAAA;AAkB9D;;;;;;;;;;GAUG;AACH,MAAM,mBAAmB,GAAG,kCAAkC,CAAA;AAE9D,SAAS,YAAY,CAAC,OAAe;IACpC,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,CAAA;IAC9B,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IACzB,MAAM,CAAC,GAAG,mBAAmB,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IAC3C,IAAI,CAAC;QAAE,OAAO,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,EAAE,CAAA;IAC1D,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAA;AAC3B,CAAC;AAED;;;;;;;GAOG;AACH,SAAS,UAAU,CAClB,KAAa,EACb,KAAyB,EACzB,MAAc,EACd,IAAY,EACZ,KAAa,EACb,QAAgB;IAEhB,MAAM,UAAU,GAAG,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACnE,MAAM,QAAQ,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAC7G,OAAO,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,UAAU,EAAE,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AACvE,CAAC;AAED,MAAM,UAAU,0BAA0B;IACzC,OAAO;QACN,EAAE,EAAE,0BAA0B;QAC9B,cAAc,EAAE,+BAA+B;QAC/C,WAAW,EACV,yIAAyI;QAE1I,KAAK,CAAC,CAAC,IAAI,CAAC,IAAoB;YAC/B,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;gBAC3C,MAAM,IAAI,KAAK,CAAC,2DAA2D,IAAI,CAAC,OAAO,EAAE,CAAC,CAAA;YAC3F,CAAC;YAED,MAAM,MAAM,GAAG,gBAAgB,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;YACrE,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CACzB,QAAQ,CAAC;gBACR,OAAO,EAAE,IAAI;gBACb,gBAAgB,EAAE,IAAI;gBACtB,YAAY,EAAE,IAAI;gBAClB,kBAAkB,EAAE,IAAI;aACxB,CAAC,CACF,CAAA;YAED,IAAI,OAAO,GAAG,CAAC,CAAA;YACf,IAAI,
CAAC;gBACJ,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,MAAoC,EAAE,CAAC;oBACjE,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;wBAAE,MAAK;oBAC/B,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,OAAO,IAAI,IAAI,CAAC,KAAK;wBAAE,MAAK;oBAE5D,MAAM,KAAK,GAAG,CAAC,MAAM,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAChD,MAAM,KAAK,GAAG,YAAY,CAAC,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC,CAAA;oBACxD,MAAM,IAAI,GAAG,CAAC,MAAM,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC/C,MAAM,SAAS,GAAG,CAAC,MAAM,CAAC,yBAAyB,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAClE,MAAM,QAAQ,GAAG,CAAC,MAAM,CAAC,kBAAkB,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAE1D,IAAI,CAAC,KAAK,IAAI,CAAC,KAAK,IAAI,CAAC,IAAI,IAAI,CAAC,QAAQ;wBAAE,SAAQ;oBACpD,MAAM,KAAK,GAAG,uBAAuB,CAAC,SAAS,CAAC,CAAA;oBAChD,IAAI,CAAC,KAAK;wBAAE,SAAQ;oBAEpB,2EAA2E;oBAC3E,wEAAwE;oBACxE,mEAAmE;oBACnE,0DAA0D;oBAC1D,MAAM,UAAU,GAA+B;wBAC9C,KAAK;wBACL,GAAG,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,KAAK,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;wBACnE,MAAM,EAAE,KAAK,CAAC,MAAM;wBACpB,QAAQ,EAAE,IAAI;wBACd,MAAM,EAAE,KAAK,CAAC,YAAY;wBAC1B,QAAQ;qBACR,CAAA;oBAED,MAAM,GAAG,GAAG,UAAU,CAAC,KAAK,EAAE,KAAK,CAAC,YAAY,EAAE,KAAK,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAA;oBACnG,IAAI,CAAC,GAAG;wBAAE,SAAQ;oBAElB,MAAM,OAAO,GAAG,mBAAmB,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;oBACpD,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,KAAK,CAAC;wBAAE,SAAQ;oBAE/C,MAAM,MAAM,GAAG,CAAC,MAAM,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC/C,MAAM,QAAQ,GAAG,MAAM;wBACtB,CAAC,CAAC,GAAG,0BAA0B,IAAI,MAAM,EAAE;wBAC3C,CAAC,CAAC,cAAc,CAAC,0BAA0B,EAAE,OAAO,CAAC,CAAA;oBAEtD,MAAM;wBACL,GAAG;wBACH,UAAU,EAAE,OAAO;wBACnB,OAAO,EAAE,IAAI;wBACb,MAAM,EAAE,OAAO;wBACf,MAAM,EAAE,0BAA0B;wBAClC,SAAS,EAAE,QAAQ;wBACnB,cAAc,EAAE,EAAE;wBAClB,OAAO,EAAE,+BAA+B;qBACxC,CAAA;oBACD,OAAO,EAAE,CAAA;gBACV,CAAC;YACF,CAAC;oBAAS,CAAC;gBACV,MAAM,CAAC,OAAO,EAAE,CAAA;YACjB,CAAC;QACF,CAAC;KACD,CAAA;AACF,CAAC;AAED,MAAM,CAAC,MAAM,oBAAoB,GAAG,0BAA0B,EAAE,CAAA"}
@@ -0,0 +1,25 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `usgov-imls-pls`: IMLS Public Libraries Survey outlet CSV consumer.
7
+ *
8
+ * The Institute of Museum and Library Services publishes an annual Public Libraries Survey with one
9
+ * row per library outlet (~17K rows). Each row carries the library name, street address, city,
10
+ * ZIP, county, and geocoordinates.
11
+ *
12
+ * The adapter consumes the outlet CSV the operator pre-downloads via `fetch-imls-pls.sh`. Column
13
+ * names match the IMLS PLS outlet file header.
14
+ *
15
+ * Output: one row per outlet with `venue` (library name) and `(house_number, street, locality,
16
+ * region, postcode)`, plus `subregion` (county) when present. `source_id` uses `FSCSKEY` when set.
17
+ *
18
+ * License: stamped `"Public Domain"` per IMLS federal government distribution terms.
19
+ */
20
+ import type { CorpusAdapter } from "../../types.js";
21
+ export declare const USGOV_IMLS_PLS_ADAPTER_ID = "usgov-imls-pls";
22
+ export declare const USGOV_IMLS_PLS_DEFAULT_LICENSE = "Public Domain";
23
+ export declare function createUsgovImlsPlsAdapter(): CorpusAdapter;
24
+ export declare const usgovImlsPlsAdapter: CorpusAdapter;
25
+ //# sourceMappingURL=adapter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../../../src/adapters/usgov-imls-pls/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAOH,OAAO,KAAK,EAAgC,aAAa,EAAE,MAAM,gBAAgB,CAAA;AAEjF,eAAO,MAAM,yBAAyB,mBAAmB,CAAA;AACzD,eAAO,MAAM,8BAA8B,kBAAkB,CAAA;AAsB7D,wBAAgB,yBAAyB,IAAI,aAAa,CAsFzD;AAED,eAAO,MAAM,mBAAmB,eAA8B,CAAA"}
@@ -0,0 +1,118 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `usgov-imls-pls`: IMLS Public Libraries Survey outlet CSV consumer.
7
+ *
8
+ * The Institute of Museum and Library Services publishes an annual Public Libraries Survey with one
9
+ * row per library outlet (~17K rows). Each row carries the library name, street address, city,
10
+ * ZIP, county, and geocoordinates.
11
+ *
12
+ * The adapter consumes the outlet CSV the operator pre-downloads via `fetch-imls-pls.sh`. Column
13
+ * names match the IMLS PLS outlet file header.
14
+ *
15
+ * Output: one row per outlet with `venue` (library name) and `(house_number, street, locality,
16
+ * region, postcode)`, plus `subregion` (county) when present. `source_id` uses `FSCSKEY` when set.
17
+ *
18
+ * License: stamped `"Public Domain"` per IMLS federal government distribution terms.
19
+ */
20
+ import { parse as csvParse } from "csv-parse";
21
+ import { createReadStream } from "node:fs";
22
+ import { stableSourceId } from "../../adapter.js";
23
+ import { lookupStateAbbreviation } from "../../codex/us-fips-state.js";
24
+ import { reconcileComponents } from "../../format.js";
25
+ export const USGOV_IMLS_PLS_ADAPTER_ID = "usgov-imls-pls";
26
+ export const USGOV_IMLS_PLS_DEFAULT_LICENSE = "Public Domain";
27
/**
 * Leading house number (digits, optional hyphenated second run, optional single trailing letter)
 * followed by whitespace and the remainder of the line.
 */
const HOUSE_NUMBER_PREFIX = /^(\d+(?:-\d+)?[A-Za-z]?)\s+(.+)$/;
/**
 * Separate an address line into its house number and street portions.
 *
 * @param address Raw address cell; surrounding whitespace is ignored.
 * @returns `null` for a blank cell, `{ house_number, street }` when the line opens with a
 *   recognised house number, otherwise `{ street }` holding the whole trimmed line.
 */
function splitAddress(address) {
    const text = address.trim();
    if (text === "") {
        return null;
    }
    const parts = HOUSE_NUMBER_PREFIX.exec(text);
    return parts === null
        ? { street: text }
        : { house_number: parts[1], street: parts[2].trim() };
}
37
/**
 * Create the `usgov-imls-pls` corpus adapter.
 *
 * The returned object carries the adapter id, default license, a description, and an async
 * generator `rows(opts)` that streams the outlet CSV at `opts.inputPath` and yields one row per
 * usable record.
 *
 * A record is skipped when `LIBNAME`, `CITY` or `ZIP` is blank, when `lookupStateAbbreviation`
 * does not recognise `STABR`, when `ADDRESS` is blank, or when `reconcileComponents` keeps two or
 * fewer components.
 *
 * @returns The adapter object.
 */
export function createUsgovImlsPlsAdapter() {
    return {
        id: USGOV_IMLS_PLS_ADAPTER_ID,
        defaultLicense: USGOV_IMLS_PLS_DEFAULT_LICENSE,
        description: "IMLS Public Libraries Survey — ~17K library outlets with venue+address (public-domain).",
        /**
         * Stream rows from the IMLS PLS outlet CSV.
         *
         * Reads `opts.country`, `opts.inputPath`, `opts.signal` and `opts.limit`. Iteration stops
         * early once `opts.signal` is aborted or `opts.limit` rows have been yielded.
         *
         * @throws Error when `opts.country` is set to anything other than `"US"`. File-read and
         *   CSV-parse failures reject the iteration as well.
         */
        async *rows(opts) {
            if (opts.country && opts.country !== "US") {
                throw new Error(`usgov-imls-pls adapter: only US supported, got country=${opts.country}`);
            }
            const stream = createReadStream(opts.inputPath, { encoding: "utf8" });
            const parser = csvParse({
                columns: true,
                // Strip a leading UTF-8 byte order mark, if present, so it cannot end up glued to
                // the first header name.
                bom: true,
                skip_empty_lines: true,
                relax_quotes: true,
                relax_column_count: true,
            });
            // `pipe()` does not forward errors from the source stream. Without this listener a
            // failed read (e.g. a missing input file) would surface as an unhandled 'error' event
            // instead of rejecting the `for await` loop below.
            stream.on("error", (error) => {
                parser.destroy(error);
            });
            stream.pipe(parser);
            let emitted = 0;
            try {
                for await (const record of parser) {
                    if (opts.signal?.aborted)
                        break;
                    if (opts.limit !== undefined && emitted >= opts.limit)
                        break;
                    const libName = (record.LIBNAME ?? "").trim();
                    const address = (record.ADDRESS ?? "").trim();
                    const city = (record.CITY ?? "").trim();
                    const zip = (record.ZIP ?? "").trim();
                    const stateAbbr = (record.STABR ?? "").trim();
                    const county = (record.CNTY ?? "").trim();
                    if (!libName || !city || !zip)
                        continue;
                    const state = lookupStateAbbreviation(stateAbbr);
                    if (!state)
                        continue;
                    const split = splitAddress(address);
                    if (!split)
                        continue;
                    // `venue` is inserted first, ahead of the address components.
                    const components = {
                        venue: libName,
                        ...(split.house_number ? { house_number: split.house_number } : {}),
                        street: split.street,
                        locality: city,
                        region: state.abbreviation,
                        postcode: zip,
                        ...(county ? { subregion: county } : {}),
                    };
                    const streetPart = [split.house_number, split.street].filter(Boolean).join(" ").trim();
                    // Use the looked-up abbreviation (the same value stored in `region`) so the
                    // raw line and the component cannot disagree if the lookup normalises the
                    // cell. NOTE(review): a no-op if `lookupStateAbbreviation` only accepts
                    // exact matches — confirm.
                    const raw = [
                        libName,
                        streetPart,
                        [city, [state.abbreviation, zip].filter(Boolean).join(" ")].filter(Boolean).join(", "),
                    ]
                        .filter(Boolean)
                        .join(", ");
                    const aligned = reconcileComponents(components, raw);
                    // Drop rows where alignment kept two or fewer components.
                    if (Object.keys(aligned).length <= 2)
                        continue;
                    // Prefer the publisher's FSCS key; fall back to an id derived from the
                    // aligned components when the cell is blank.
                    const fscsKey = (record.FSCSKEY ?? "").trim();
                    const sourceId = fscsKey
                        ? `${USGOV_IMLS_PLS_ADAPTER_ID}-${fscsKey}`
                        : stableSourceId(USGOV_IMLS_PLS_ADAPTER_ID, aligned);
                    yield {
                        raw,
                        components: aligned,
                        country: "US",
                        locale: "en-US",
                        source: USGOV_IMLS_PLS_ADAPTER_ID,
                        source_id: sourceId,
                        // NOTE(review): left empty here — presumably stamped later by the caller; confirm.
                        corpus_version: "",
                        license: USGOV_IMLS_PLS_DEFAULT_LICENSE,
                    };
                    emitted++;
                }
            }
            finally {
                // Release both ends of the pipe on completion, early exit, or failure.
                stream.destroy();
                parser.destroy();
            }
        },
    };
}
/** Shared default instance of the `usgov-imls-pls` adapter. */
export const usgovImlsPlsAdapter = createUsgovImlsPlsAdapter();
118
+ //# sourceMappingURL=adapter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../../../src/adapters/usgov-imls-pls/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,EAAE,KAAK,IAAI,QAAQ,EAAE,MAAM,WAAW,CAAA;AAC7C,OAAO,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAA;AAC1C,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAA;AACjD,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAA;AACtE,OAAO,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAA;AAGrD,MAAM,CAAC,MAAM,yBAAyB,GAAG,gBAAgB,CAAA;AACzD,MAAM,CAAC,MAAM,8BAA8B,GAAG,eAAe,CAAA;AAE7D,MAAM,mBAAmB,GAAG,kCAAkC,CAAA;AAY9D,SAAS,YAAY,CAAC,OAAe;IACpC,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,CAAA;IAC9B,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IACzB,MAAM,CAAC,GAAG,mBAAmB,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IAC3C,IAAI,CAAC;QAAE,OAAO,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,EAAE,CAAA;IAC1D,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAA;AAC3B,CAAC;AAED,MAAM,UAAU,yBAAyB;IACxC,OAAO;QACN,EAAE,EAAE,yBAAyB;QAC7B,cAAc,EAAE,8BAA8B;QAC9C,WAAW,EAAE,yFAAyF;QAEtG,KAAK,CAAC,CAAC,IAAI,CAAC,IAAoB;YAC/B,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;gBAC3C,MAAM,IAAI,KAAK,CAAC,0DAA0D,IAAI,CAAC,OAAO,EAAE,CAAC,CAAA;YAC1F,CAAC;YAED,MAAM,MAAM,GAAG,gBAAgB,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;YACrE,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CACzB,QAAQ,CAAC;gBACR,OAAO,EAAE,IAAI;gBACb,gBAAgB,EAAE,IAAI;gBACtB,YAAY,EAAE,IAAI;gBAClB,kBAAkB,EAAE,IAAI;aACxB,CAAC,CACF,CAAA;YAED,IAAI,OAAO,GAAG,CAAC,CAAA;YACf,IAAI,CAAC;gBACJ,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,MAAsC,EAAE,CAAC;oBACnE,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;wBAAE,MAAK;oBAC/B,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,OAAO,IAAI,IAAI,CAAC,KAAK;wBAAE,MAAK;oBAE5D,MAAM,OAAO,GAAG,CAAC,MAAM,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC7C,MAAM,OAAO,GAAG,CAAC,MAAM,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC7C,MAAM,IAAI,GAAG,CAAC,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBACvC,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBACrC,MAAM,SAAS,GAAG,CAAC,MAAM,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBA
C7C,MAAM,MAAM,GAAG,CAAC,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAEzC,IAAI,CAAC,OAAO,IAAI,CAAC,IAAI,IAAI,CAAC,GAAG;wBAAE,SAAQ;oBAEvC,MAAM,KAAK,GAAG,uBAAuB,CAAC,SAAS,CAAC,CAAA;oBAChD,IAAI,CAAC,KAAK;wBAAE,SAAQ;oBAEpB,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,CAAC,CAAA;oBACnC,IAAI,CAAC,KAAK;wBAAE,SAAQ;oBAEpB,MAAM,UAAU,GAA+B;wBAC9C,KAAK,EAAE,OAAO;wBACd,GAAG,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,KAAK,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;wBACnE,MAAM,EAAE,KAAK,CAAC,MAAM;wBACpB,QAAQ,EAAE,IAAI;wBACd,MAAM,EAAE,KAAK,CAAC,YAAY;wBAC1B,QAAQ,EAAE,GAAG;wBACb,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;qBACxC,CAAA;oBAED,MAAM,UAAU,GAAG,CAAC,KAAK,CAAC,YAAY,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;oBACtF,MAAM,GAAG,GAAG;wBACX,OAAO;wBACP,UAAU;wBACV,CAAC,IAAI,EAAE,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;qBAC7E;yBACC,MAAM,CAAC,OAAO,CAAC;yBACf,IAAI,CAAC,IAAI,CAAC,CAAA;oBAEZ,MAAM,OAAO,GAAG,mBAAmB,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;oBACpD,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,IAAI,CAAC;wBAAE,SAAQ;oBAE9C,MAAM,OAAO,GAAG,CAAC,MAAM,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC7C,MAAM,QAAQ,GAAG,OAAO;wBACvB,CAAC,CAAC,GAAG,yBAAyB,IAAI,OAAO,EAAE;wBAC3C,CAAC,CAAC,cAAc,CAAC,yBAAyB,EAAE,OAAO,CAAC,CAAA;oBAErD,MAAM;wBACL,GAAG;wBACH,UAAU,EAAE,OAAO;wBACnB,OAAO,EAAE,IAAI;wBACb,MAAM,EAAE,OAAO;wBACf,MAAM,EAAE,yBAAyB;wBACjC,SAAS,EAAE,QAAQ;wBACnB,cAAc,EAAE,EAAE;wBAClB,OAAO,EAAE,8BAA8B;qBACvC,CAAA;oBACD,OAAO,EAAE,CAAA;gBACV,CAAC;YACF,CAAC;oBAAS,CAAC;gBACV,MAAM,CAAC,OAAO,EAAE,CAAA;YACjB,CAAC;QACF,CAAC;KACD,CAAA;AACF,CAAC;AAED,MAAM,CAAC,MAAM,mBAAmB,GAAG,yBAAyB,EAAE,CAAA"}
@@ -0,0 +1,37 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `usgov-nad`: US DOT National Address Database — ~97M structured address-point records.
7
+ *
8
+ * The single largest US address source available — federal aggregation of state + local 911-grade
9
+ * address points (every addressable location). Compared to TIGER ADDRFEAT (~20M segment-level, no
10
+ * city/locality) and NPPES (~7M provider-centric venues), NAD covers the entire residential +
11
+ * commercial address space with full structured components.
12
+ *
13
+ * The adapter consumes NDJSON shards produced by `fetch-nad.ts`'s featureserver mode (operator
14
+ * pre-downloads via `npx tsx packages/corpus/scripts/fetch-nad.ts`). Each shard is per-OID-range
15
+ * `oids_<start>-<end>.ndjson` with a sibling `.manifest.json`. Adapter iterates every `.ndjson`
16
+ * in the input directory, skipping the `quarantined-bash-bug/` subdir (legacy of the bash-
17
+ * fetcher's silent-page-failure bug).
18
+ *
19
+ * Field mapping (NAD v9 → CanonicalRow components):
20
+ *
21
+ * - House_number: `AddNo_Full` (pre-composed); falls back to AddNum_Pre + Add_Number + AddNum_Suf
22
+ * - Street: `StNam_Full` (pre-composed); falls back to composing
23
+ *   St_PreDir + St_PreTyp + St_Name + St_PosTyp
24
+ *   + St_PosDir + St_PosMod
25
+ * - Locality: `Post_City` > `Inc_Muni` > `Census_Plc` > `Uninc_Comm` (first non-empty)
26
+ * - Region: `State` (2-char USPS code, including territories: PR, GU, VI, AS, MP)
27
+ * - Postcode: `Zip_Code` + `Plus_4` (joined as `XXXXX-NNNN` when both present)
28
+ * - Venue: `LandmkName` (typically a park, school, hospital, named facility — when present)
29
+ *
30
+ * License: stamped `"Public Domain"` per 17 U.S.C. § 105 (US federal works).
31
+ */
32
+ import type { CorpusAdapter } from "../../types.js";
33
+ export declare const USGOV_NAD_ADAPTER_ID = "usgov-nad";
34
+ export declare const USGOV_NAD_DEFAULT_LICENSE = "Public Domain";
35
+ export declare function createUsgovNadAdapter(): CorpusAdapter;
36
+ export declare const usgovNadAdapter: CorpusAdapter;
37
+ //# sourceMappingURL=adapter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../../../src/adapters/usgov-nad/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AAQH,OAAO,KAAK,EAAgC,aAAa,EAAE,MAAM,gBAAgB,CAAA;AAEjF,eAAO,MAAM,oBAAoB,cAAc,CAAA;AAC/C,eAAO,MAAM,yBAAyB,kBAAkB,CAAA;AA+JxD,wBAAgB,qBAAqB,IAAI,aAAa,CAuFrD;AAED,eAAO,MAAM,eAAe,eAA0B,CAAA"}
@@ -0,0 +1,227 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `usgov-nad`: US DOT National Address Database — ~97M structured address-point records.
7
+ *
8
+ * The single largest US address source available — federal aggregation of state + local 911-grade
9
+ * address points (every addressable location). Compared to TIGER ADDRFEAT (~20M segment-level, no
10
+ * city/locality) and NPPES (~7M provider-centric venues), NAD covers the entire residential +
11
+ * commercial address space with full structured components.
12
+ *
13
+ * The adapter consumes NDJSON shards produced by `fetch-nad.ts`'s featureserver mode (operator
14
+ * pre-downloads via `npx tsx packages/corpus/scripts/fetch-nad.ts`). Each shard is per-OID-range
15
+ * `oids_<start>-<end>.ndjson` with a sibling `.manifest.json`. Adapter iterates every `.ndjson`
16
+ * in the input directory, skipping the `quarantined-bash-bug/` subdir (legacy of the bash-
17
+ * fetcher's silent-page-failure bug).
18
+ *
19
+ * Field mapping (NAD v9 → CanonicalRow components):
20
+ *
21
+ * - House_number: `AddNo_Full` (pre-composed); falls back to AddNum_Pre + Add_Number + AddNum_Suf
22
+ * - Street: `StNam_Full` (pre-composed); falls back to joining the non-empty parts of St_PreMod,
23
+ *   St_PreDir, St_PreTyp, St_PreSep, St_Name, St_PosTyp, St_PosDir and St_PosMod with single
24
+ *   spaces
25
+ * - Locality: `Post_City` > `Inc_Muni` > `Census_Plc` > `Uninc_Comm` (first non-empty)
26
+ * - Region: `State` (2-char USPS code, including territories: PR, GU, VI, AS, MP)
27
+ * - Postcode: `Zip_Code` + `Plus_4` (joined as `XXXXX-NNNN` when both present)
28
+ * - Venue: `LandmkName` (typically a park, school, hospital, named facility — when present)
29
+ *
30
+ * License: stamped `"Public Domain"` per 17 U.S.C. § 105 (US federal works).
31
+ */
32
+ import { createReadStream } from "node:fs";
33
+ import { readdir } from "node:fs/promises";
34
+ import { join } from "node:path";
35
+ import { createInterface } from "node:readline";
36
+ import { reconcileComponents } from "../../format.js";
37
/** Identifier stamped on every row as `source` and used as the `source_id` prefix. */
export const USGOV_NAD_ADAPTER_ID = "usgov-nad";

/** License string stamped on every row emitted by this adapter. */
export const USGOV_NAD_DEFAULT_LICENSE = "Public Domain";

/**
 * Two-letter region codes accepted from the NAD `State` field. Records whose upper-cased,
 * trimmed `State` is not a member are skipped by the adapter.
 */
const US_STATES_SET = new Set([
	// The 50 states plus DC.
	"AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL",
	"GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME",
	"MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH",
	"NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",
	"SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI",
	"WY",
	// Territories that ship in NAD
	"PR", "GU", "VI", "AS", "MP",
]);
98
/**
 * Returns the first argument that is non-empty after string conversion and trimming.
 *
 * `null` and `undefined` count as empty. Arguments are inspected lazily, left to right, and
 * the trimmed text of the first hit is returned.
 *
 * @param {...*} values Candidate values in priority order.
 * @returns {string | undefined} Trimmed text of the first non-empty candidate, else `undefined`.
 */
function nonEmpty(...values) {
	for (let index = 0; index < values.length; index += 1) {
		const candidate = values[index];
		const text = (candidate ?? "").toString().trim();
		if (text.length > 0) {
			return text;
		}
	}
	return undefined;
}
106
/**
 * Derives the house number for a NAD record.
 *
 * The pre-composed `AddNo_Full` wins when it is non-blank. Otherwise the number is assembled
 * from `AddNum_Pre`, `Add_Number` and `AddNum_Suf`, joined with single spaces. `Add_Number`
 * is required for the fallback: without it nothing is returned.
 *
 * @param {object} r Parsed NAD record.
 * @returns {string | undefined} House number text, or `undefined` when none can be derived.
 */
function composeHouseNumber(r) {
	const clean = (value) => (value ?? "").toString().trim();

	const preComposed = clean(r.AddNo_Full);
	if (preComposed) {
		return preComposed;
	}

	const number = r.Add_Number == null ? "" : String(r.Add_Number).trim();
	if (number === "") {
		return undefined;
	}

	const pieces = [];
	for (const piece of [clean(r.AddNum_Pre), number, clean(r.AddNum_Suf)]) {
		if (piece) {
			pieces.push(piece);
		}
	}
	const joined = pieces.join(" ").trim();
	return joined || undefined;
}
117
/**
 * Derives the street name for a NAD record.
 *
 * The pre-composed `StNam_Full` wins when it is non-blank. Otherwise the non-blank street
 * parts are joined with single spaces in the order: pre-modifier, pre-direction, pre-type,
 * pre-separator, name, post-type, post-direction, post-modifier.
 *
 * @param {object} r Parsed NAD record.
 * @returns {string | undefined} Street text, or `undefined` when every part is blank.
 */
function composeStreet(r) {
	const preComposed = (r.StNam_Full ?? "").toString().trim();
	if (preComposed) {
		return preComposed;
	}

	const fieldOrder = [
		"St_PreMod",
		"St_PreDir",
		"St_PreTyp",
		"St_PreSep",
		"St_Name",
		"St_PosTyp",
		"St_PosDir",
		"St_PosMod",
	];
	const tokens = [];
	for (const field of fieldOrder) {
		const token = (r[field] ?? "").toString().trim();
		if (token) {
			tokens.push(token);
		}
	}
	return tokens.length > 0 ? tokens.join(" ") : undefined;
}
126
/**
 * Picks the locality for a NAD record: the first non-empty of `Post_City`, `Inc_Muni`,
 * `Census_Plc` and `Uninc_Comm`, in that priority order.
 *
 * @param {object} r Parsed NAD record.
 * @returns {string | undefined} Trimmed locality, or `undefined` when all four are blank.
 */
function composeLocality(r) {
	const { Post_City, Inc_Muni, Census_Plc, Uninc_Comm } = r;
	return nonEmpty(Post_City, Inc_Muni, Census_Plc, Uninc_Comm);
}
129
/**
 * Derives the postcode for a NAD record from `Zip_Code` and `Plus_4`.
 *
 * @param {object} r Parsed NAD record.
 * @returns {string | undefined} `Zip_Code` alone, `Zip_Code-Plus_4` when both are non-blank,
 *   or `undefined` when `Zip_Code` is blank (a lone `Plus_4` is ignored).
 */
function composePostcode(r) {
	const zip = (r.Zip_Code ?? "").toString().trim();
	if (zip === "") {
		return undefined;
	}

	const plus4 = (r.Plus_4 ?? "").toString().trim();
	if (plus4 === "") {
		return zip;
	}
	return `${zip}-${plus4}`;
}
136
/**
 * Renders the single-line raw address string for a row.
 *
 * Layout: `[venue, ][houseNumber street, ]locality, region postcode`. The venue and the street
 * line are omitted when empty; the `locality, region postcode` tail is always emitted.
 *
 * @param {object} parts Address parts: `venue`, `houseNumber`, `street`, `locality`, `region`,
 *   `postcode`.
 * @returns {string} Comma-separated address line.
 */
function composeRaw(parts) {
	const streetTokens = [];
	if (parts.houseNumber) {
		streetTokens.push(parts.houseNumber);
	}
	if (parts.street) {
		streetTokens.push(parts.street);
	}
	const streetLine = streetTokens.join(" ").trim();

	const segments = [];
	if (parts.venue) {
		segments.push(parts.venue);
	}
	if (streetLine) {
		segments.push(streetLine);
	}
	// The tail always contains at least ", " so it is never filtered out.
	segments.push(`${parts.locality}, ${parts.region} ${parts.postcode}`);
	return segments.join(", ");
}
141
/**
 * Builds the `usgov-nad` corpus adapter.
 *
 * The returned object carries the adapter id, default license and description, plus an async
 * generator `rows(opts)` that streams rows out of every `.ndjson` shard found directly inside
 * `opts.inputPath`.
 *
 * @returns {object} Adapter with `id`, `defaultLicense`, `description` and `rows`.
 */
export function createUsgovNadAdapter() {
	return {
		id: USGOV_NAD_ADAPTER_ID,
		defaultLicense: USGOV_NAD_DEFAULT_LICENSE,
		description: "US DOT National Address Database — ~97M structured US address points (911-grade). Single largest US source.",
		/**
		 * Streams rows from the NDJSON shards in `opts.inputPath`.
		 *
		 * Reads `opts.inputPath` (directory), and optionally `opts.country`, `opts.limit` and
		 * `opts.signal`. Shards are processed in sorted file-name order. A line is skipped when
		 * it is blank, is not valid JSON, does not parse to an object, has a `State` outside
		 * `US_STATES_SET`, or lacks a locality or a postcode.
		 *
		 * @param {object} opts Adapter options.
		 * @throws {Error} When `opts.country` is set to anything other than `"US"`.
		 */
		async *rows(opts) {
			if (opts.country && opts.country !== "US") {
				throw new Error(`usgov-nad adapter: only US supported, got country=${opts.country}`);
			}
			// inputPath is a directory of NDJSON shards (per fetch-nad.ts featureserver output).
			// Single-file inputs (e.g. a bulk-extracted CSV) are not currently supported — the
			// featureserver shard pattern is the primary distribution.
			// readdir is not recursive, so shards inside subdirectories are never picked up.
			const entries = await readdir(opts.inputPath);
			const shards = entries.filter((n) => n.endsWith(".ndjson")).sort();
			let emitted = 0;
			outer: for (const shard of shards) {
				if (opts.signal?.aborted)
					break;
				const stream = createReadStream(join(opts.inputPath, shard), { encoding: "utf8" });
				const rl = createInterface({ input: stream, crlfDelay: Infinity });
				try {
					for await (const line of rl) {
						if (opts.signal?.aborted)
							break outer;
						if (opts.limit !== undefined && emitted >= opts.limit)
							break outer;
						if (!line)
							continue;
						let record;
						try {
							record = JSON.parse(line);
						}
						catch {
							continue; // malformed line — skip silently
						}
						// JSON.parse also accepts `null`, numbers, strings and arrays. Reading
						// `.State` off `null` would throw and abort the whole stream, so treat
						// anything that is not a plain object like a malformed line.
						if (record === null || typeof record !== "object" || Array.isArray(record))
							continue;
						const state = (record.State ?? "").toString().trim().toUpperCase();
						if (!US_STATES_SET.has(state))
							continue;
						const locality = composeLocality(record);
						if (!locality)
							continue;
						const postcode = composePostcode(record);
						if (!postcode)
							continue;
						const street = composeStreet(record);
						const houseNumber = composeHouseNumber(record);
						const venue = nonEmpty(record.LandmkName);
						// Optional components are only included when present.
						const components = {
							...(venue ? { venue } : {}),
							...(houseNumber ? { house_number: houseNumber } : {}),
							...(street ? { street } : {}),
							locality,
							region: state,
							postcode,
						};
						const raw = composeRaw({ venue, houseNumber, street, locality, region: state, postcode });
						if (!raw)
							continue;
						// NOTE(review): reconcileComponents lives in ../../format.js; presumably it
						// keeps only the components that can be located in `raw` — confirm.
						const aligned = reconcileComponents(components, raw);
						// Rows left with two or fewer components are dropped.
						if (Object.keys(aligned).length <= 2)
							continue;
						// Prefer the record's UUID, then OBJECTID, then a shard-local counter.
						const sourceId = record.UUID
							? `${USGOV_NAD_ADAPTER_ID}-${record.UUID}`
							: `${USGOV_NAD_ADAPTER_ID}-${record.OBJECTID ?? `${shard}:${emitted}`}`;
						yield {
							raw,
							components: aligned,
							country: "US",
							locale: "en-US",
							source: USGOV_NAD_ADAPTER_ID,
							source_id: sourceId,
							corpus_version: "",
							license: USGOV_NAD_DEFAULT_LICENSE,
						};
						emitted++;
					}
				}
				finally {
					// Runs on normal completion, on `break outer`, and when the consumer stops early.
					rl.close();
					stream.destroy();
				}
			}
		},
	};
}

/** Shared default instance of the `usgov-nad` adapter. */
export const usgovNadAdapter = createUsgovNadAdapter();
227
+ //# sourceMappingURL=adapter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../../../src/adapters/usgov-nad/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AAEH,OAAO,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAA;AAC1C,OAAO,EAAE,OAAO,EAAE,MAAM,kBAAkB,CAAA;AAC1C,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAA;AAChC,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAA;AAE/C,OAAO,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAA;AAGrD,MAAM,CAAC,MAAM,oBAAoB,GAAG,WAAW,CAAA;AAC/C,MAAM,CAAC,MAAM,yBAAyB,GAAG,eAAe,CAAA;AAgDxD,MAAM,aAAa,GAAG,IAAI,GAAG,CAAC;IAC7B,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,+BAA+B;IAC/B,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;CACJ,CAAC,CAAA;AAEF,SAAS,QAAQ,CAAC,GAAG,MAAwC;IAC5D,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACxB,MAAM,OAAO,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;QAC3C,IAAI,OAAO;YAAE,OAAO,OAAO,CAAA;IAC5B,CAAC;IACD,OAAO,SAAS,CAAA;AACjB,CAAC;AAED,SAAS,kBAAkB,CAAC,CAAY;IACvC,MAAM,IAAI,GAAG,CAAC,CAAC,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;IACnD,IAAI,IAAI;QAAE,OAAO,IAAI,CAAA;IACrB,MAAM,GAAG,GAAG,CAAC,CAAC,UAAU,IAAI,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,IAAI,EAAE,CAAA;IACnE,IAAI,CAAC,GAAG;QAAE,OAAO,SAAS,CAAA;IAC1B,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;IAClD,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;IAClD,OAAO,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,IAAI,SAAS,CAAA;AACrE,CAAC;AAED,SAAS,aAAa,CAAC,CAAY;I
AClC,MAAM,IAAI,GAAG,CAAC,CAAC,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;IACnD,IAAI,IAAI;QAAE,OAAO,IAAI,CAAA;IACrB,MAAM,KAAK,GAAG,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,CAAC;SAClH,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAC;SACvC,MAAM,CAAC,OAAO,CAAC,CAAA;IACjB,OAAO,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAA;AAClD,CAAC;AAED,SAAS,eAAe,CAAC,CAAY;IACpC,OAAO,QAAQ,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,UAAU,CAAC,CAAA;AACrE,CAAC;AAED,SAAS,eAAe,CAAC,CAAY;IACpC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;IAChD,IAAI,CAAC,GAAG;QAAE,OAAO,SAAS,CAAA;IAC1B,MAAM,KAAK,GAAG,CAAC,CAAC,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAA;IAChD,OAAO,KAAK,CAAC,CAAC,CAAC,GAAG,GAAG,IAAI,KAAK,EAAE,CAAC,CAAC,CAAC,GAAG,CAAA;AACvC,CAAC;AAED,SAAS,UAAU,CAAC,KAOnB;IACA,MAAM,UAAU,GAAG,CAAC,KAAK,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACrF,MAAM,IAAI,GAAG,GAAG,KAAK,CAAC,QAAQ,KAAK,KAAK,CAAC,MAAM,IAAI,KAAK,CAAC,QAAQ,EAAE,CAAA;IACnE,OAAO,CAAC,KAAK,CAAC,KAAK,EAAE,UAAU,IAAI,SAAS,EAAE,IAAI,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AAC/E,CAAC;AAED,MAAM,UAAU,qBAAqB;IACpC,OAAO;QACN,EAAE,EAAE,oBAAoB;QACxB,cAAc,EAAE,yBAAyB;QACzC,WAAW,EACV,6GAA6G;QAE9G,KAAK,CAAC,CAAC,IAAI,CAAC,IAAoB;YAC/B,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;gBAC3C,MAAM,IAAI,KAAK,CAAC,qDAAqD,IAAI,CAAC,OAAO,EAAE,CAAC,CAAA;YACrF,CAAC;YAED,qFAAqF;YACrF,mFAAmF;YACnF,2DAA2D;YAC3D,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;YAC7C,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;YAElE,IAAI,OAAO,GAAG,CAAC,CAAA;YACf,KAAK,EAAE,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;gBA
CnC,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;oBAAE,MAAK;gBAC/B,MAAM,MAAM,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,KAAK,CAAC,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;gBAClF,MAAM,EAAE,GAAG,eAAe,CAAC,EAAE,KAAK,EAAE,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAA;gBAClE,IAAI,CAAC;oBACJ,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,EAAE,EAAE,CAAC;wBAC7B,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;4BAAE,MAAM,KAAK,CAAA;wBACrC,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,OAAO,IAAI,IAAI,CAAC,KAAK;4BAAE,MAAM,KAAK,CAAA;wBAClE,IAAI,CAAC,IAAI;4BAAE,SAAQ;wBAEnB,IAAI,MAAiB,CAAA;wBACrB,IAAI,CAAC;4BACJ,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAc,CAAA;wBACvC,CAAC;wBAAC,MAAM,CAAC;4BACR,SAAQ,CAAC,iCAAiC;wBAC3C,CAAC;wBAED,MAAM,KAAK,GAAG,CAAC,MAAM,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAA;wBAClE,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,KAAK,CAAC;4BAAE,SAAQ;wBAEvC,MAAM,QAAQ,GAAG,eAAe,CAAC,MAAM,CAAC,CAAA;wBACxC,IAAI,CAAC,QAAQ;4BAAE,SAAQ;wBAEvB,MAAM,QAAQ,GAAG,eAAe,CAAC,MAAM,CAAC,CAAA;wBACxC,IAAI,CAAC,QAAQ;4BAAE,SAAQ;wBAEvB,MAAM,MAAM,GAAG,aAAa,CAAC,MAAM,CAAC,CAAA;wBACpC,MAAM,WAAW,GAAG,kBAAkB,CAAC,MAAM,CAAC,CAAA;wBAC9C,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,UAAU,CAAC,CAAA;wBAEzC,MAAM,UAAU,GAA+B;4BAC9C,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;4BAC3B,GAAG,CAAC,WAAW,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;4BACrD,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;4BAC7B,QAAQ;4BACR,MAAM,EAAE,KAAK;4BACb,QAAQ;yBACR,CAAA;wBAED,MAAM,GAAG,GAAG,UAAU,CAAC,EAAE,KAAK,EAAE,WAAW,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAA;wBACzF,IAAI,CAAC,GAAG;4BAAE,SAAQ;wBAElB,MAAM,OAAO,GAAG,mBAAmB,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;wBACpD,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,IAAI,CAAC;4BAAE,SAAQ;wBAE9C,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI;4BAC3B,CAAC,CAAC,GAAG,oBAAoB,IAAI,MAAM,CAAC,IAAI,EAAE;4BAC1C,CAAC,CAAC,GAAG,oBAAoB,IAAI,MAAM,CAAC,QAAQ,IAAI,GAAG,KAAK,IAAI,OAAO,EAAE,EAAE,CAAA;wBAExE,MAAM;4BACL,GAAG;4BACH,UAAU,EAAE,OAAO;4BACnB,OAAO,EAAE,IAAI;4BACb,MAAM,EAAE,OAAO;4BACf,MAA
M,EAAE,oBAAoB;4BAC5B,SAAS,EAAE,QAAQ;4BACnB,cAAc,EAAE,EAAE;4BAClB,OAAO,EAAE,yBAAyB;yBAClC,CAAA;wBACD,OAAO,EAAE,CAAA;oBACV,CAAC;gBACF,CAAC;wBAAS,CAAC;oBACV,EAAE,CAAC,KAAK,EAAE,CAAA;oBACV,MAAM,CAAC,OAAO,EAAE,CAAA;gBACjB,CAAC;YACF,CAAC;QACF,CAAC;KACD,CAAA;AACF,CAAC;AAED,MAAM,CAAC,MAAM,eAAe,GAAG,qBAAqB,EAAE,CAAA"}
@@ -0,0 +1,28 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `usgov-nppes`: CMS National Plan and Provider Enumeration System (NPI registry) CSV consumer.
7
+ *
8
+ * NPPES is the authoritative US healthcare provider registry, published monthly by CMS. Each row
9
+ * carries a provider's business practice location address together with their legal business name
10
+ * or individual name. At ~7M rows it is the single largest venue+address signal source
11
+ * available.
12
+ *
13
+ * The adapter consumes the monthly full-replacement CSV (operator pre-downloads via
14
+ * `fetch-nppes.sh`). Column names match the canonical NPPES "Full Replacement Monthly NPI File"
15
+ * header published at `https://download.cms.gov/nppes/NPI_Files.html`.
16
+ *
17
+ * Output: one row per CSV record where the practice location address is populated. Organization
18
+ * rows carry `venue` from the legal business name; individual rows compose `attention` from
19
+ * last+first name. Address quad goes on `(house_number, street, locality, region, postcode)`.
20
+ *
21
+ * License: stamped `"Public Domain"` per CMS's federal government distribution terms.
22
+ */
23
+ import type { CorpusAdapter } from "../../types.js";
24
+ export declare const USGOV_NPPES_ADAPTER_ID = "usgov-nppes";
25
+ export declare const USGOV_NPPES_DEFAULT_LICENSE = "Public Domain";
26
+ export declare function createUsgovNppesAdapter(): CorpusAdapter;
27
+ export declare const usgovNppesAdapter: CorpusAdapter;
28
+ //# sourceMappingURL=adapter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../../../src/adapters/usgov-nppes/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAOH,OAAO,KAAK,EAAgC,aAAa,EAAE,MAAM,gBAAgB,CAAA;AAEjF,eAAO,MAAM,sBAAsB,gBAAgB,CAAA;AACnD,eAAO,MAAM,2BAA2B,kBAAkB,CAAA;AAsC1D,wBAAgB,uBAAuB,IAAI,aAAa,CAqFvD;AAED,eAAO,MAAM,iBAAiB,eAA4B,CAAA"}