@mailwoman/corpus 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/src/adapter.d.ts +96 -0
- package/out/src/adapter.d.ts.map +1 -0
- package/out/src/adapter.js +107 -0
- package/out/src/adapter.js.map +1 -0
- package/out/src/adapters/ban/adapter.d.ts +32 -0
- package/out/src/adapters/ban/adapter.d.ts.map +1 -0
- package/out/src/adapters/ban/adapter.js +133 -0
- package/out/src/adapters/ban/adapter.js.map +1 -0
- package/out/src/adapters/fcc-bdc/adapter.d.ts +61 -0
- package/out/src/adapters/fcc-bdc/adapter.d.ts.map +1 -0
- package/out/src/adapters/fcc-bdc/adapter.js +153 -0
- package/out/src/adapters/fcc-bdc/adapter.js.map +1 -0
- package/out/src/adapters/index.d.ts +42 -0
- package/out/src/adapters/index.d.ts.map +1 -0
- package/out/src/adapters/index.js +76 -0
- package/out/src/adapters/index.js.map +1 -0
- package/out/src/adapters/openaddresses/adapter.d.ts +60 -0
- package/out/src/adapters/openaddresses/adapter.d.ts.map +1 -0
- package/out/src/adapters/openaddresses/adapter.js +174 -0
- package/out/src/adapters/openaddresses/adapter.js.map +1 -0
- package/out/src/adapters/state-ia-contractors/adapter.d.ts +23 -0
- package/out/src/adapters/state-ia-contractors/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-ia-contractors/adapter.js +113 -0
- package/out/src/adapters/state-ia-contractors/adapter.js.map +1 -0
- package/out/src/adapters/state-ny-notaries/adapter.d.ts +21 -0
- package/out/src/adapters/state-ny-notaries/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-ny-notaries/adapter.js +132 -0
- package/out/src/adapters/state-ny-notaries/adapter.js.map +1 -0
- package/out/src/adapters/state-tx-notaries/adapter.d.ts +22 -0
- package/out/src/adapters/state-tx-notaries/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-tx-notaries/adapter.js +125 -0
- package/out/src/adapters/state-tx-notaries/adapter.js.map +1 -0
- package/out/src/adapters/tiger/adapter.d.ts +45 -0
- package/out/src/adapters/tiger/adapter.d.ts.map +1 -0
- package/out/src/adapters/tiger/adapter.js +179 -0
- package/out/src/adapters/tiger/adapter.js.map +1 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts +36 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.js +147 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.js.map +1 -0
- package/out/src/adapters/usgov-imls-pls/adapter.d.ts +25 -0
- package/out/src/adapters/usgov-imls-pls/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-imls-pls/adapter.js +118 -0
- package/out/src/adapters/usgov-imls-pls/adapter.js.map +1 -0
- package/out/src/adapters/usgov-nad/adapter.d.ts +37 -0
- package/out/src/adapters/usgov-nad/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-nad/adapter.js +227 -0
- package/out/src/adapters/usgov-nad/adapter.js.map +1 -0
- package/out/src/adapters/usgov-nppes/adapter.d.ts +28 -0
- package/out/src/adapters/usgov-nppes/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-nppes/adapter.js +123 -0
- package/out/src/adapters/usgov-nppes/adapter.js.map +1 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts +35 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js +162 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js.map +1 -0
- package/out/src/adapters/wof-admin-json/adapter.d.ts +85 -0
- package/out/src/adapters/wof-admin-json/adapter.d.ts.map +1 -0
- package/out/src/adapters/wof-admin-json/adapter.js +241 -0
- package/out/src/adapters/wof-admin-json/adapter.js.map +1 -0
- package/out/src/adapters/wof-postalcode-json/adapter.d.ts +63 -0
- package/out/src/adapters/wof-postalcode-json/adapter.d.ts.map +1 -0
- package/out/src/adapters/wof-postalcode-json/adapter.js +178 -0
- package/out/src/adapters/wof-postalcode-json/adapter.js.map +1 -0
- package/out/src/align.d.ts +58 -0
- package/out/src/align.d.ts.map +1 -0
- package/out/src/align.js +139 -0
- package/out/src/align.js.map +1 -0
- package/out/src/build.d.ts +104 -0
- package/out/src/build.d.ts.map +1 -0
- package/out/src/build.js +201 -0
- package/out/src/build.js.map +1 -0
- package/out/src/codex/us-fips-state.d.ts +44 -0
- package/out/src/codex/us-fips-state.d.ts.map +1 -0
- package/out/src/codex/us-fips-state.js +105 -0
- package/out/src/codex/us-fips-state.js.map +1 -0
- package/out/src/codex/us-street-suffix.d.ts +259 -0
- package/out/src/codex/us-street-suffix.d.ts.map +1 -0
- package/out/src/codex/us-street-suffix.js +285 -0
- package/out/src/codex/us-street-suffix.js.map +1 -0
- package/out/src/format.d.ts +79 -0
- package/out/src/format.d.ts.map +1 -0
- package/out/src/format.js +151 -0
- package/out/src/format.js.map +1 -0
- package/out/src/golden.d.ts +50 -0
- package/out/src/golden.d.ts.map +1 -0
- package/out/src/golden.js +104 -0
- package/out/src/golden.js.map +1 -0
- package/out/src/index.d.ts +18 -0
- package/out/src/index.d.ts.map +1 -0
- package/out/src/index.js +18 -0
- package/out/src/index.js.map +1 -0
- package/out/src/parquet-wrapper/index.d.ts +12 -0
- package/out/src/parquet-wrapper/index.d.ts.map +1 -0
- package/out/src/parquet-wrapper/index.js +12 -0
- package/out/src/parquet-wrapper/index.js.map +1 -0
- package/out/src/parquet-wrapper/reader.d.ts +31 -0
- package/out/src/parquet-wrapper/reader.d.ts.map +1 -0
- package/out/src/parquet-wrapper/reader.js +54 -0
- package/out/src/parquet-wrapper/reader.js.map +1 -0
- package/out/src/parquet-wrapper/schema.d.ts +45 -0
- package/out/src/parquet-wrapper/schema.d.ts.map +1 -0
- package/out/src/parquet-wrapper/schema.js +55 -0
- package/out/src/parquet-wrapper/schema.js.map +1 -0
- package/out/src/parquet-wrapper/writer.d.ts +41 -0
- package/out/src/parquet-wrapper/writer.d.ts.map +1 -0
- package/out/src/parquet-wrapper/writer.js +71 -0
- package/out/src/parquet-wrapper/writer.js.map +1 -0
- package/out/src/parquet.d.ts +122 -0
- package/out/src/parquet.d.ts.map +1 -0
- package/out/src/parquet.js +220 -0
- package/out/src/parquet.js.map +1 -0
- package/out/src/runner.d.ts +100 -0
- package/out/src/runner.d.ts.map +1 -0
- package/out/src/runner.js +183 -0
- package/out/src/runner.js.map +1 -0
- package/out/src/split.d.ts +108 -0
- package/out/src/split.d.ts.map +1 -0
- package/out/src/split.js +191 -0
- package/out/src/split.js.map +1 -0
- package/out/src/synthesize.d.ts +146 -0
- package/out/src/synthesize.d.ts.map +1 -0
- package/out/src/synthesize.js +472 -0
- package/out/src/synthesize.js.map +1 -0
- package/out/src/tokenize.d.ts +47 -0
- package/out/src/tokenize.d.ts.map +1 -0
- package/out/src/tokenize.js +49 -0
- package/out/src/tokenize.js.map +1 -0
- package/out/src/types.d.ts +168 -0
- package/out/src/types.d.ts.map +1 -0
- package/out/src/types.js +19 -0
- package/out/src/types.js.map +1 -0
- package/out/src/wof-json.d.ts +105 -0
- package/out/src/wof-json.d.ts.map +1 -0
- package/out/src/wof-json.js +174 -0
- package/out/src/wof-json.js.map +1 -0
- package/package.json +36 -0
package/out/src/align.js
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Alignment: turn a `CanonicalRow` (raw + components) into a `LabeledRow` (raw + tokens + BIO
|
|
7
|
+
* labels) or a `QuarantinedRow` (raw + reason) per the Phase 1 plan.
|
|
8
|
+
*
|
|
9
|
+
* Pipeline:
|
|
10
|
+
*
|
|
11
|
+
* 1. For each `(tag, value)` in `components`, find the value's character span in `raw`. First try a
|
|
12
|
+
* verbatim substring match (case-insensitive unless `caseInsensitive: false`; whitespace is matched as-is). If that fails, fall
|
|
13
|
+
* back to fuzzy match via `fastest-levenshtein`, with a tunable edit distance threshold.
|
|
14
|
+
* 2. If any component cannot be located, reject the row with a human-readable reason and send it to
|
|
15
|
+
* the quarantine pile (`reason: "component-not-found:<tag>"` or
|
|
16
|
+
* `"edit-distance-exceeded:<tag>:<dist>"`).
|
|
17
|
+
* 3. Tokenize `raw` with the supplied `Tokenizer` (defaults to the whitespace tokenizer).
|
|
18
|
+
* 4. For each token: walk the list of component spans, pick the one whose span contains the token's
|
|
19
|
+
* character range. First token in a component span → `B-<tag>`; subsequent tokens →
|
|
20
|
+
* `I-<tag>`; no overlap → `O`.
|
|
21
|
+
*
|
|
22
|
+
* Two structural invariants the function preserves:
|
|
23
|
+
*
|
|
24
|
+
* - `tokens.length === labels.length` always.
|
|
25
|
+
* - Each component contributes at most one contiguous BIO run (no `B-tag … O … I-tag` gaps). This is
|
|
26
|
+
* enforced by greedy first-match span assignment + ordered token iteration.
|
|
27
|
+
*/
|
|
28
|
+
import { distance as levenshteinDistance } from "fastest-levenshtein";
|
|
29
|
+
import { whitespaceTokenizer } from "./tokenize.js";
|
|
30
|
+
/** Align a single row. */
|
|
31
|
+
export function alignRow(row, opts = {}) {
|
|
32
|
+
const tokenizer = opts.tokenizer ?? whitespaceTokenizer();
|
|
33
|
+
const maxEditDistance = opts.maxEditDistance ?? 2;
|
|
34
|
+
const caseInsensitive = opts.caseInsensitive ?? true;
|
|
35
|
+
if (!row.raw) {
|
|
36
|
+
return { kind: "quarantined", row: { row, reason: "raw-empty" } };
|
|
37
|
+
}
|
|
38
|
+
const componentSpans = [];
|
|
39
|
+
const claimed = [];
|
|
40
|
+
const haystack = caseInsensitive ? row.raw.toLowerCase() : row.raw;
|
|
41
|
+
for (const [tag, value] of Object.entries(row.components)) {
|
|
42
|
+
if (!value)
|
|
43
|
+
continue;
|
|
44
|
+
const needle = caseInsensitive ? value.toLowerCase() : value;
|
|
45
|
+
const span = locateSpan({ haystack, needle, raw: row.raw, claimed, maxEditDistance });
|
|
46
|
+
if (!span) {
|
|
47
|
+
return {
|
|
48
|
+
kind: "quarantined",
|
|
49
|
+
row: { row, reason: `component-not-found:${tag}` },
|
|
50
|
+
};
|
|
51
|
+
}
|
|
52
|
+
componentSpans.push({ tag, start: span.start, end: span.end });
|
|
53
|
+
claimed.push([span.start, span.end]);
|
|
54
|
+
}
|
|
55
|
+
componentSpans.sort((a, b) => a.start - b.start);
|
|
56
|
+
const tokens = tokenizer.tokenize(row.raw);
|
|
57
|
+
const labels = labelTokens(tokens, componentSpans);
|
|
58
|
+
const labeled = {
|
|
59
|
+
...row,
|
|
60
|
+
tokens: tokens.map((t) => t.text),
|
|
61
|
+
labels,
|
|
62
|
+
};
|
|
63
|
+
return { kind: "labeled", row: labeled };
|
|
64
|
+
}
|
|
65
|
+
/**
 * Locate `needle` inside `haystack` (the caller has already case-folded both when requested).
 *
 * Pass 1 looks for the leftmost verbatim occurrence that does not overlap a claimed range.
 * Pass 2, used only when pass 1 finds nothing, slides a window of `needle.length` characters over
 * `haystack` and returns the leftmost unclaimed window within the edit-distance budget.
 *
 * The budget is `maxEditDistance` capped at `needle.length - 1`. Without the cap, a needle no
 * longer than the threshold would "match" any window at all (every character substituted), which
 * would label arbitrary text instead of rejecting the row.
 *
 * @param args `haystack`, `needle`, `claimed` (array of `[start, end)` pairs already taken) and
 *   `maxEditDistance`. `args.raw` is accepted but not read.
 * @returns `{ start, end }` as offsets into `haystack` (end exclusive), or `undefined` when the
 *   needle is empty or cannot be located.
 */
function locateSpan(args) {
    const { haystack, needle, claimed, maxEditDistance } = args;
    const len = needle.length;
    if (len === 0) {
        return undefined;
    }
    // Pass 1: verbatim substring, leftmost non-claimed.
    for (let idx = haystack.indexOf(needle); idx >= 0; idx = haystack.indexOf(needle, idx + 1)) {
        if (!overlapsClaimed(idx, idx + len, claimed)) {
            return { start: idx, end: idx + len };
        }
    }
    // Pass 2: fuzzy sliding window. At least one character of the needle must survive.
    const budget = Math.min(maxEditDistance, len - 1);
    if (budget <= 0) {
        return undefined;
    }
    for (let i = 0; i + len <= haystack.length; i++) {
        if (overlapsClaimed(i, i + len, claimed)) {
            continue;
        }
        // An exact window cannot occur here: pass 1 already returned every unclaimed exact match.
        if (levenshteinDistance(haystack.slice(i, i + len), needle) <= budget) {
            return { start: i, end: i + len };
        }
    }
    return undefined;
}
|
|
105
|
+
/**
 * Report whether the half-open range `[start, end)` intersects any claimed range.
 *
 * Ranges that merely touch (one ends where the other starts) do not count as overlapping.
 *
 * @param start Inclusive start offset of the candidate range.
 * @param end Exclusive end offset of the candidate range.
 * @param claimed Array of `[start, end)` pairs that are already taken.
 * @returns `true` if at least one claimed range intersects the candidate.
 */
function overlapsClaimed(start, end, claimed) {
    return claimed.some(([a, b]) => start < b && a < end);
}
|
|
112
|
+
/**
 * Assign BIO labels to tokens given the component spans.
 *
 * A token belongs to the first span (in array order) that fully contains its `[start, end)` range.
 * A token that only partially overlaps a span, or overlaps none, is labeled `O`. The first token
 * assigned to a span gets `B-<tag>`; later tokens assigned to the same span get `I-<tag>`.
 *
 * @param tokens Array of tokens with numeric `start` / `end` offsets.
 * @param spans Array of `{ tag, start, end }`; `alignRow` passes them sorted by start offset.
 * @returns One label per token, in token order.
 */
function labelTokens(tokens, spans) {
    const opened = new Set(); // indexes into `spans` that already produced a `B-` label
    return tokens.map((token) => {
        const hit = spans.findIndex((s) => token.start >= s.start && token.end <= s.end);
        if (hit < 0) {
            return "O";
        }
        if (opened.has(hit)) {
            return `I-${spans[hit].tag}`;
        }
        opened.add(hit);
        return `B-${spans[hit].tag}`;
    });
}
|
|
139
|
+
//# sourceMappingURL=align.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"align.js","sourceRoot":"","sources":["../../src/align.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAGH,OAAO,EAAE,QAAQ,IAAI,mBAAmB,EAAE,MAAM,qBAAqB,CAAA;AACrE,OAAO,EAAE,mBAAmB,EAAkC,MAAM,eAAe,CAAA;AAiCnF,0BAA0B;AAC1B,MAAM,UAAU,QAAQ,CAAC,GAAiB,EAAE,OAAqB,EAAE;IAClE,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,mBAAmB,EAAE,CAAA;IACzD,MAAM,eAAe,GAAG,IAAI,CAAC,eAAe,IAAI,CAAC,CAAA;IACjD,MAAM,eAAe,GAAG,IAAI,CAAC,eAAe,IAAI,IAAI,CAAA;IAEpD,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC;QACd,OAAO,EAAE,IAAI,EAAE,aAAa,EAAE,GAAG,EAAE,EAAE,GAAG,EAAE,MAAM,EAAE,WAAW,EAAE,EAAE,CAAA;IAClE,CAAC;IAED,MAAM,cAAc,GAAoB,EAAE,CAAA;IAC1C,MAAM,OAAO,GAA4B,EAAE,CAAA;IAE3C,MAAM,QAAQ,GAAG,eAAe,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAA;IAElE,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAA8C,EAAE,CAAC;QACxG,IAAI,CAAC,KAAK;YAAE,SAAQ;QAEpB,MAAM,MAAM,GAAG,eAAe,CAAC,CAAC,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,KAAK,CAAA;QAC5D,MAAM,IAAI,GAAG,UAAU,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,EAAE,GAAG,CAAC,GAAG,EAAE,OAAO,EAAE,eAAe,EAAE,CAAC,CAAA;QAErF,IAAI,CAAC,IAAI,EAAE,CAAC;YACX,OAAO;gBACN,IAAI,EAAE,aAAa;gBACnB,GAAG,EAAE,EAAE,GAAG,EAAE,MAAM,EAAE,uBAAuB,GAAG,EAAE,EAAE;aAClD,CAAA;QACF,CAAC;QAED,cAAc,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,GAAG,EAAE,IAAI,CAAC,GAAG,EAAE,CAAC,CAAA;QAC9D,OAAO,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;IACrC,CAAC;IAED,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAA;IAChD,MAAM,MAAM,GAAG,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;IAC1C,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,EAAE,cAAc,CAAC,CAAA;IAElD,MAAM,OAAO,GAAe;QAC3B,GAAG,GAAG;QACN,MAAM,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QACjC,MAAM;KACN,CAAA;IACD,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,GAAG,EAAE,OAAO,EAAE,CAAA;AACzC,CAAC;AAED;;;;;;;GAOG;AACH,SAAS,UAAU,CAAC,IAMnB;IACA,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,EAAE,GAAG,IAAI,CAAA;IAC
3D,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,SAAS,CAAA;IAEzC,oDAAoD;IACpD,IAAI,IAAI,GAAG,CAAC,CAAA;IACZ,OAAO,IAAI,EAAE,CAAC;QACb,MAAM,GAAG,GAAG,QAAQ,CAAC,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC,CAAA;QAC1C,IAAI,GAAG,GAAG,CAAC;YAAE,MAAK;QAClB,MAAM,GAAG,GAAG,GAAG,GAAG,MAAM,CAAC,MAAM,CAAA;QAC/B,IAAI,CAAC,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,OAAO,CAAC;YAAE,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,EAAE,CAAA;QACnE,IAAI,GAAG,GAAG,GAAG,CAAC,CAAA;IACf,CAAC;IAED,IAAI,eAAe,IAAI,CAAC;QAAE,OAAO,SAAS,CAAA;IAE1C,sFAAsF;IACtF,sFAAsF;IACtF,MAAM,GAAG,GAAG,MAAM,CAAC,MAAM,CAAA;IACzB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,IAAI,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACjD,IAAI,eAAe,CAAC,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,OAAO,CAAC;YAAE,SAAQ;QAClD,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,CAAA;QACzC,IAAI,MAAM,KAAK,MAAM;YAAE,OAAO,EAAE,KAAK,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,GAAG,EAAE,CAAA,CAAC,+BAA+B;QACxF,MAAM,CAAC,GAAG,mBAAmB,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;QAC7C,IAAI,CAAC,IAAI,eAAe;YAAE,OAAO,EAAE,KAAK,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,GAAG,EAAE,CAAA;IAC5D,CAAC;IAED,OAAO,SAAS,CAAA;AACjB,CAAC;AAED,SAAS,eAAe,CAAC,KAAa,EAAE,GAAW,EAAE,OAAgC;IACpF,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,OAAO,EAAE,CAAC;QAC9B,IAAI,KAAK,GAAG,CAAC,IAAI,CAAC,GAAG,GAAG;YAAE,OAAO,IAAI,CAAA;IACtC,CAAC;IACD,OAAO,KAAK,CAAA;AACb,CAAC;AAED;;;;GAIG;AACH,SAAS,WAAW,CAAC,MAA4B,EAAE,KAA+B;IACjF,MAAM,GAAG,GAAe,EAAE,CAAA;IAC1B,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAA,CAAC,qBAAqB;IAExD,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC5B,IAAI,QAAQ,GAAa,GAAG,CAAA;QAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,CAAC,GAAG,KAAK,CAAC,CAAC,CAAE,CAAA;YACnB,IAAI,KAAK,CAAC,KAAK,IAAI,CAAC,CAAC,KAAK,IAAI,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC,GAAG,EAAE,CAAC;gBAClD,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;oBACtB,QAAQ,GAAG,KAAK,CAAC,CAAC,GAAG,EAAc,CAAA;oBACnC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;gBAChB,CAAC;qBAAM,CAAC;oBACP,QAAQ,GAAG,KAAK,CAAC,CAAC,GAAG,EAAc,CAAA;gBACpC,CAAC;gBACD,MAAK;YACN,CAAC;QACF,CAAC;QACD,GAAG
,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;IACnB,CAAC;IAED,OAAO,GAAG,CAAA;AACX,CAAC"}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* End-to-end corpus build (Phase 1 task #10 in the plan).
|
|
7
|
+
*
|
|
8
|
+
* `buildCorpus(opts)` orchestrates every stage of the pipeline:
|
|
9
|
+
*
|
|
10
|
+
* 1. **Adapter runs** — drives every adapter in turn (via `runAdapter`), writing
|
|
11
|
+
* `<intermediate>/<adapter.id>/canonical.jsonl` shards.
|
|
12
|
+
* 2. **Synthesis** — optional. For each canonical row, every applicable augmentation in the row's
|
|
13
|
+
* country-default policy emits an augmented row alongside the original.
|
|
14
|
+
* 3. **Alignment** — every row (original + augmented) is aligned via `alignRow`. Successes go to
|
|
15
|
+
* `labeled-{train,val,test}.jsonl` (one file per split); quarantines are appended to `quarantine.jsonl` with reasons.
|
|
16
|
+
* 4. **Splits** — `splitRows` partitions labeled `source_id`s into train/val/test by locality holdout.
|
|
17
|
+
* Manifest written to `splits/SPLIT_MANIFEST.json` + per-split `train.txt` / `val.txt` /
|
|
18
|
+
* `test.txt`.
|
|
19
|
+
* 5. **Parquet shards** — `writeShards` streams labeled rows into 1M-row `.parquet` shards per split
|
|
20
|
+
* under `corpus-v<version>/{train,val,test}/part-NNNN.parquet` (SNAPPY-compressed, 50k-row
|
|
21
|
+
* row groups), with per-shard checksums + per-stage manifest in
|
|
22
|
+
* `corpus-v<version>/MANIFEST.json`.
|
|
23
|
+
* 6. **Top-level manifest** — `<outputDir>/MANIFEST.json` ties every per-stage manifest together with
|
|
24
|
+
* a top-level corpus_version, built_at, and aggregate counts.
|
|
25
|
+
*
|
|
26
|
+
* Output layout:
|
|
27
|
+
*
|
|
28
|
+
* ```
|
|
29
|
+
* <outputDir>/
|
|
30
|
+
* MANIFEST.json
|
|
31
|
+
* intermediate/
|
|
32
|
+
* <adapter.id>/canonical.jsonl # one per adapter
|
|
33
|
+
* labeled-{train,val,test}.jsonl # post-alignment, pre-shard (one file per split)
|
|
34
|
+
* quarantine.jsonl # rows that failed alignment
|
|
35
|
+
* splits/
|
|
36
|
+
* SPLIT_MANIFEST.json
|
|
37
|
+
* train.txt / val.txt / test.txt
|
|
38
|
+
* corpus-v<version>/
|
|
39
|
+
* MANIFEST.json
|
|
40
|
+
* train/part-NNNN.parquet
|
|
41
|
+
* val/part-NNNN.parquet
|
|
42
|
+
* test/part-NNNN.parquet
|
|
43
|
+
* ```
|
|
44
|
+
*
|
|
45
|
+
* The intermediate files live alongside the final shards for reproducibility + debugging. Operators
|
|
46
|
+
* can `rm -rf intermediate/` after the build if disk is tight; the final `corpus-v<version>/` is
|
|
47
|
+
* self-contained.
|
|
48
|
+
*/
|
|
49
|
+
import { type ShardManifest } from "./parquet.js";
|
|
50
|
+
import { type AdapterRunManifest } from "./runner.js";
|
|
51
|
+
import { type SplitManifest } from "./split.js";
|
|
52
|
+
import type { AdapterOptions, CorpusAdapter } from "./types.js";
|
|
53
|
+
/** Identifies which pipeline stage an `onProgress` notification comes from. */
export type BuildStage =
    | "adapter-run"
    | "align"
    | "split"
    | "shard"
    | "manifest";
|
|
55
|
+
/** Options accepted by a single `buildCorpus` invocation. */
export interface BuildCorpusOptions {
    /** Root directory under which every build artifact is written. */
    outputDir: string;
    /** Version string for the corpus (e.g. `"0.1.0"`); stamped onto every row and used in the output dir name. */
    corpusVersion: string;
    /**
     * Adapters to run, in the given order. When omitted, `defaultAdapterRegistry.list()` is used.
     * Supply an explicit list to restrict the run (e.g. `[wofAdminAdapter]` for a smoke run).
     */
    adapters?: ReadonlyArray<CorpusAdapter>;
    /**
     * `AdapterOptions` for each adapter, keyed by adapter id. An adapter without an entry here is
     * skipped and recorded in the manifest.
     */
    adapterInputs: Record<string, AdapterOptions>;
    /** Whether to run the synthesis pass. Defaults to `true`; use `false` for fixture-driven smoke tests. */
    synthesize?: boolean;
    /** Shard size passed through to `writeShards`. Defaults to 1_000_000. */
    rowsPerShard?: number;
    /** Progress callback. If it throws, the build is aborted. */
    onProgress?: (stage: BuildStage, message: string) => void;
}
|
|
78
|
+
/** Top-level manifest that summarises every stage of one build. */
export interface BuildCorpusManifest {
    /** The `corpusVersion` the build was invoked with. */
    corpus_version: string;
    /** ISO-8601 timestamp of the build. */
    built_at: string;
    /** One run manifest per adapter that was executed. */
    adapters: Array<AdapterRunManifest>;
    /** Ids of adapters that were skipped because no input was configured for them. */
    skipped_adapters: Array<string>;
    /** Summary of the split stage. */
    splits: {
        counts: SplitManifest["counts"];
        holdouts: SplitManifest["holdouts"];
    };
    /** Summary of the parquet shard stage. */
    shards: {
        counts: ShardManifest["counts"];
        total_rows: number;
    };
    /** Number of rows that failed alignment. */
    quarantine_count: number;
    /** Number of rows that aligned successfully. */
    total_aligned_rows: number;
}
|
|
95
|
+
/**
 * Drive the full corpus build to completion.
 *
 * Memory profile: canonical rows are streamed one at a time and each labeled row is written
 * straight to its split-specific JSONL (`labeled-{train,val,test}.jsonl`). The split is decided
 * per row, so the build keeps no per-row lookup table in memory; the canonical and labeled
 * payloads stream and never sit in memory.
 *
 * @param opts Build options.
 * @returns The top-level manifest, which is also written to `<outputDir>/MANIFEST.json`.
 */
export declare function buildCorpus(opts: BuildCorpusOptions): Promise<BuildCorpusManifest>;
|
|
104
|
+
//# sourceMappingURL=build.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"build.d.ts","sourceRoot":"","sources":["../../src/build.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+CG;AAQH,OAAO,EAAe,KAAK,aAAa,EAAE,MAAM,cAAc,CAAA;AAC9D,OAAO,EAAc,KAAK,kBAAkB,EAAE,MAAM,aAAa,CAAA;AACjE,OAAO,EAIN,KAAK,aAAa,EAElB,MAAM,YAAY,CAAA;AAEnB,OAAO,KAAK,EAAE,cAAc,EAAgB,aAAa,EAAc,MAAM,YAAY,CAAA;AAEzF,2CAA2C;AAC3C,MAAM,MAAM,UAAU,GAAG,aAAa,GAAG,OAAO,GAAG,OAAO,GAAG,OAAO,GAAG,UAAU,CAAA;AAEjF,gDAAgD;AAChD,MAAM,WAAW,kBAAkB;IAClC,kEAAkE;IAClE,SAAS,EAAE,MAAM,CAAA;IAEjB,0FAA0F;IAC1F,aAAa,EAAE,MAAM,CAAA;IAErB;;;OAGG;IACH,QAAQ,CAAC,EAAE,SAAS,aAAa,EAAE,CAAA;IAEnC;;;OAGG;IACH,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,cAAc,CAAC,CAAA;IAE7C,yFAAyF;IACzF,UAAU,CAAC,EAAE,OAAO,CAAA;IAEpB,qDAAqD;IACrD,YAAY,CAAC,EAAE,MAAM,CAAA;IAErB,oDAAoD;IACpD,UAAU,CAAC,EAAE,CAAC,KAAK,EAAE,UAAU,EAAE,OAAO,EAAE,MAAM,KAAK,IAAI,CAAA;CACzD;AAED,qDAAqD;AACrD,MAAM,WAAW,mBAAmB;IACnC,cAAc,EAAE,MAAM,CAAA;IACtB,QAAQ,EAAE,MAAM,CAAA;IAChB,QAAQ,EAAE,kBAAkB,EAAE,CAAA;IAC9B,gBAAgB,EAAE,MAAM,EAAE,CAAA;IAC1B,MAAM,EAAE;QAAE,MAAM,EAAE,aAAa,CAAC,QAAQ,CAAC,CAAC;QAAC,QAAQ,EAAE,aAAa,CAAC,UAAU,CAAC,CAAA;KAAE,CAAA;IAChF,MAAM,EAAE;QAAE,MAAM,EAAE,aAAa,CAAC,QAAQ,CAAC,CAAC;QAAC,UAAU,EAAE,MAAM,CAAA;KAAE,CAAA;IAC/D,gBAAgB,EAAE,MAAM,CAAA;IACxB,kBAAkB,EAAE,MAAM,CAAA;CAC1B;AAED;;;;;;;GAOG;AACH,wBAAsB,WAAW,CAAC,IAAI,EAAE,kBAAkB,GAAG,OAAO,CAAC,mBAAmB,CAAC,CAgIxF"}
|
package/out/src/build.js
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* End-to-end corpus build (Phase 1 task #10 in the plan).
|
|
7
|
+
*
|
|
8
|
+
* `buildCorpus(opts)` orchestrates every stage of the pipeline:
|
|
9
|
+
*
|
|
10
|
+
* 1. **Adapter runs** — drives every adapter in turn (via `runAdapter`), writing
|
|
11
|
+
* `<intermediate>/<adapter.id>/canonical.jsonl` shards.
|
|
12
|
+
* 2. **Synthesis** — optional. For each canonical row, every applicable augmentation in the row's
|
|
13
|
+
* country-default policy emits an augmented row alongside the original.
|
|
14
|
+
* 3. **Alignment** — every row (original + augmented) is aligned via `alignRow`. Successes go to
|
|
15
|
+
* `labeled-{train,val,test}.jsonl` (one file per split); quarantines are appended to `quarantine.jsonl` with reasons.
|
|
16
|
+
* 4. **Splits** — `splitRows` partitions labeled `source_id`s into train/val/test by locality holdout.
|
|
17
|
+
* Manifest written to `splits/SPLIT_MANIFEST.json` + per-split `train.txt` / `val.txt` /
|
|
18
|
+
* `test.txt`.
|
|
19
|
+
* 5. **Parquet shards** — `writeShards` streams labeled rows into 1M-row `.parquet` shards per split
|
|
20
|
+
* under `corpus-v<version>/{train,val,test}/part-NNNN.parquet` (SNAPPY-compressed, 50k-row
|
|
21
|
+
* row groups), with per-shard checksums + per-stage manifest in
|
|
22
|
+
* `corpus-v<version>/MANIFEST.json`.
|
|
23
|
+
* 6. **Top-level manifest** — `<outputDir>/MANIFEST.json` ties every per-stage manifest together with
|
|
24
|
+
* a top-level corpus_version, built_at, and aggregate counts.
|
|
25
|
+
*
|
|
26
|
+
* Output layout:
|
|
27
|
+
*
|
|
28
|
+
* ```
|
|
29
|
+
* <outputDir>/
|
|
30
|
+
* MANIFEST.json
|
|
31
|
+
* intermediate/
|
|
32
|
+
* <adapter.id>/canonical.jsonl # one per adapter
|
|
33
|
+
* labeled-{train,val,test}.jsonl # post-alignment, pre-shard (one file per split)
|
|
34
|
+
* quarantine.jsonl # rows that failed alignment
|
|
35
|
+
* splits/
|
|
36
|
+
* SPLIT_MANIFEST.json
|
|
37
|
+
* train.txt / val.txt / test.txt
|
|
38
|
+
* corpus-v<version>/
|
|
39
|
+
* MANIFEST.json
|
|
40
|
+
* train/part-NNNN.parquet
|
|
41
|
+
* val/part-NNNN.parquet
|
|
42
|
+
* test/part-NNNN.parquet
|
|
43
|
+
* ```
|
|
44
|
+
*
|
|
45
|
+
* The intermediate files live alongside the final shards for reproducibility + debugging. Operators
|
|
46
|
+
* can `rm -rf intermediate/` after the build if disk is tight; the final `corpus-v<version>/` is
|
|
47
|
+
* self-contained.
|
|
48
|
+
*/
|
|
49
|
+
import { createReadStream, createWriteStream } from "node:fs";
|
|
50
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
51
|
+
import { join } from "node:path";
|
|
52
|
+
import { createInterface } from "node:readline";
|
|
53
|
+
import { defaultAdapterRegistry } from "./adapter.js";
|
|
54
|
+
import { alignRow } from "./align.js";
|
|
55
|
+
import { writeShards } from "./parquet.js";
|
|
56
|
+
import { runAdapter } from "./runner.js";
|
|
57
|
+
import { defaultHoldouts, splitForRow, writeSplitManifestsFromLabeledFiles, } from "./split.js";
|
|
58
|
+
import { defaultAugmentationsForCountry, synthesizeRow } from "./synthesize.js";
|
|
59
|
+
/**
 * Drive the full corpus build to completion.
 *
 * Stages run in order: adapter runs → (optional) synthesis + alignment → split manifests →
 * parquet shards → top-level `MANIFEST.json`.
 *
 * Memory profile: canonical rows are streamed one at a time and each labeled row is written
 * straight to its split-specific JSONL, so no per-row lookup table is kept. Writes wait for
 * `'drain'` whenever a stream reports a full buffer, so a slow disk cannot make the write buffers
 * grow without bound.
 *
 * @param opts Build options: `outputDir`, `corpusVersion`, `adapterInputs`, and optional
 *   `adapters`, `synthesize`, `rowsPerShard`, `onProgress`.
 * @returns The top-level manifest, which is also written to `<outputDir>/MANIFEST.json`.
 * @throws Re-throws errors from adapters, stage helpers and `onProgress`, plus stream errors
 *   raised while waiting for `'drain'`. If the align stage fails, the intermediate output
 *   streams are destroyed before the error propagates.
 */
export async function buildCorpus(opts) {
    const adapters = opts.adapters ?? defaultAdapterRegistry.list();
    const synthesize = opts.synthesize ?? true;
    const rowsPerShard = opts.rowsPerShard ?? 1_000_000;
    const built_at = new Date().toISOString();
    await mkdir(opts.outputDir, { recursive: true });
    const intermediateDir = join(opts.outputDir, "intermediate");
    await mkdir(intermediateDir, { recursive: true });
    // 1. Adapter runs.
    const adapterRuns = [];
    const skipped = [];
    for (const adapter of adapters) {
        const adapterOptions = opts.adapterInputs[adapter.id];
        if (!adapterOptions) {
            skipped.push(adapter.id);
            opts.onProgress?.("adapter-run", `skipped ${adapter.id} (no input configured)`);
            continue;
        }
        opts.onProgress?.("adapter-run", `running ${adapter.id}`);
        const m = await runAdapter({
            adapter,
            adapterOptions,
            outputDir: intermediateDir,
            corpusVersion: opts.corpusVersion,
        });
        adapterRuns.push(m);
    }
    // 2 + 3. Synthesis + alignment: stream every canonical.jsonl, optionally augment, align,
    // and route each labeled row directly to its split-specific JSONL
    // (`labeled-{train,val,test}.jsonl`). The split is decided inline via `splitForRow`.
    const labeledPaths = {
        train: join(intermediateDir, "labeled-train.jsonl"),
        val: join(intermediateDir, "labeled-val.jsonl"),
        test: join(intermediateDir, "labeled-test.jsonl"),
    };
    const labeledStreams = {
        train: createWriteStream(labeledPaths.train, { encoding: "utf8" }),
        val: createWriteStream(labeledPaths.val, { encoding: "utf8" }),
        test: createWriteStream(labeledPaths.test, { encoding: "utf8" }),
    };
    const quarantinePath = join(intermediateDir, "quarantine.jsonl");
    const quarantineStream = createWriteStream(quarantinePath, { encoding: "utf8" });
    let aligned = 0;
    let quarantined = 0;
    const counts = { train: 0, val: 0, test: 0 };
    const holdouts = defaultHoldouts();
    // Resolves once `stream` can accept more data; rejects if the stream errors while waiting.
    const drained = (stream) => new Promise((resolve, reject) => {
        const onDrain = () => {
            stream.off("error", onError);
            resolve();
        };
        const onError = (err) => {
            stream.off("drain", onDrain);
            reject(err);
        };
        stream.once("drain", onDrain);
        stream.once("error", onError);
    });
    // Write one line; pause the producer when the stream's buffer is full.
    const writeLine = async (stream, line) => {
        if (!stream.write(line)) {
            await drained(stream);
        }
    };
    try {
        for (const adapterRun of adapterRuns) {
            opts.onProgress?.("align", `aligning ${adapterRun.adapter_id}`);
            for await (const row of streamJsonl(adapterRun.jsonl_path)) {
                const fanned = [row];
                if (synthesize) {
                    for (const aug of synthesizeRow(row, defaultAugmentationsForCountry(row.country))) {
                        fanned.push(aug);
                    }
                }
                for (const r of fanned) {
                    const result = alignRow(r);
                    if (result.kind === "labeled") {
                        const split = splitForRow(result.row, holdouts);
                        await writeLine(labeledStreams[split], `${JSON.stringify(result.row)}\n`);
                        counts[split]++;
                        aligned++;
                    }
                    else {
                        await writeLine(quarantineStream, `${JSON.stringify({ row: r, reason: result.row.reason })}\n`);
                        quarantined++;
                    }
                }
            }
        }
    }
    catch (err) {
        // Do not leave file descriptors open when the align stage fails part-way.
        for (const s of Object.values(labeledStreams)) {
            s.destroy();
        }
        quarantineStream.destroy();
        throw err;
    }
    for (const s of Object.values(labeledStreams)) {
        s.end();
    }
    quarantineStream.end();
    await Promise.all([...Object.values(labeledStreams).map(streamEnd), streamEnd(quarantineStream)]);
    // 4. Splits — the manifest is derived from the per-split labeled files written above.
    opts.onProgress?.("split", `splitting ${aligned} aligned rows`);
    const splitsDir = join(opts.outputDir, "splits");
    const splitCounts = await writeSplitManifestsFromLabeledFiles({
        labeledPaths,
        outputDir: splitsDir,
        corpusVersion: opts.corpusVersion,
        counts,
        holdouts,
    });
    // 5. Parquet shards — per-split labeled JSONL streams in, sharded `.parquet` out.
    opts.onProgress?.("shard", "writing parquet shards");
    const shardManifest = await writeShards({
        train: streamJsonl(labeledPaths.train),
        val: streamJsonl(labeledPaths.val),
        test: streamJsonl(labeledPaths.test),
    }, {
        outputDir: opts.outputDir,
        corpusVersion: opts.corpusVersion,
        rowsPerShard,
    });
    // 6. Top-level manifest.
    opts.onProgress?.("manifest", "writing top-level MANIFEST.json");
    const manifest = {
        corpus_version: opts.corpusVersion,
        built_at,
        adapters: adapterRuns,
        skipped_adapters: skipped,
        splits: { counts: splitCounts, holdouts },
        shards: { counts: shardManifest.counts, total_rows: shardManifest.total_rows },
        quarantine_count: quarantined,
        total_aligned_rows: aligned,
    };
    await writeFile(join(opts.outputDir, "MANIFEST.json"), `${JSON.stringify(manifest, null, 2)}\n`, "utf8");
    return manifest;
}
|
|
185
|
+
/**
 * Lazily read a JSONL file, yielding one parsed value per non-blank line.
 *
 * Lines are trimmed before parsing and blank lines are skipped. The file is read as UTF-8 and
 * `\r\n` is treated as a single line break (`crlfDelay: Infinity`).
 *
 * @param {string} path - Path of the JSONL file to read.
 * @yields {any} The `JSON.parse` result of each non-blank line, in file order.
 * @throws {SyntaxError} When a line is not valid JSON; the message carries `path:lineNumber`
 *   and the original parser error is attached as `cause`.
 * @throws {Error} The read stream's own error (for example `ENOENT`) when the file cannot be read.
 */
async function* streamJsonl(path) {
    const stream = createReadStream(path, { encoding: "utf8" });
    const rl = createInterface({ input: stream, crlfDelay: Infinity });
    // Capture read failures explicitly so they surface as a rejection from this generator
    // instead of depending on readline to forward the input stream's "error" event.
    let streamError = null;
    stream.once("error", (err) => {
        streamError = err;
        rl.close();
    });
    let lineNumber = 0;
    try {
        for await (const line of rl) {
            lineNumber++;
            const trimmed = line.trim();
            if (!trimmed)
                continue;
            let record;
            try {
                record = JSON.parse(trimmed);
            }
            catch (err) {
                // Keep the SyntaxError type, but say which record of which file is malformed.
                throw new SyntaxError(`${path}:${lineNumber}: invalid JSONL record: ${err.message}`, { cause: err });
            }
            yield record;
        }
        if (streamError)
            throw streamError;
    }
    finally {
        // Runs on normal completion, on a thrown error, and when the consumer stops iterating
        // early; closing readline does not release the underlying file stream by itself.
        rl.close();
        stream.destroy();
    }
}
|
|
195
|
+
/**
 * Wait for a stream to finish closing.
 *
 * @param {import("node:stream").Writable} s - Stream to wait on; the caller is expected to have
 *   called (or to call) `end()` / `destroy()` on it.
 * @returns {Promise<void>} Resolves once the stream has emitted "close"; rejects with the stream's
 *   error if it emits "error" first.
 */
function streamEnd(s) {
    return new Promise((resolve, reject) => {
        // A stream that closed before we got here will never emit "close" again; settle from
        // its recorded state instead of waiting forever. On runtimes without these properties
        // they read as undefined and the listener path below is used.
        if (s.closed) {
            if (s.errored)
                reject(s.errored);
            else
                resolve();
            return;
        }
        s.once("close", () => resolve());
        s.once("error", reject);
    });
}
|
|
201
|
+
//# sourceMappingURL=build.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"build.js","sourceRoot":"","sources":["../../src/build.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+CG;AAEH,OAAO,EAAE,gBAAgB,EAAE,iBAAiB,EAAoB,MAAM,SAAS,CAAA;AAC/E,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAA;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAA;AAChC,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAA;AAC/C,OAAO,EAAE,sBAAsB,EAAE,MAAM,cAAc,CAAA;AACrD,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAA;AACrC,OAAO,EAAE,WAAW,EAAsB,MAAM,cAAc,CAAA;AAC9D,OAAO,EAAE,UAAU,EAA2B,MAAM,aAAa,CAAA;AACjE,OAAO,EACN,eAAe,EACf,WAAW,EACX,mCAAmC,GAGnC,MAAM,YAAY,CAAA;AACnB,OAAO,EAAE,8BAA8B,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAA;AAgD/E;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,IAAwB;IACzD,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,IAAI,sBAAsB,CAAC,IAAI,EAAE,CAAA;IAC/D,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,IAAI,IAAI,CAAA;IAC1C,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,IAAI,SAAS,CAAA;IACnD,MAAM,QAAQ,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAA;IAEzC,MAAM,KAAK,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;IAChD,MAAM,eAAe,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,cAAc,CAAC,CAAA;IAC5D,MAAM,KAAK,CAAC,eAAe,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;IAEjD,mBAAmB;IACnB,MAAM,WAAW,GAAyB,EAAE,CAAA;IAC5C,MAAM,OAAO,GAAa,EAAE,CAAA;IAC5B,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAChC,MAAM,cAAc,GAAG,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,EAAE,CAAC,CAAA;QACrD,IAAI,CAAC,cAAc,EAAE,CAAC;YACrB,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,CAAA;YACxB,IAAI,CAAC,UAAU,EAAE,CAAC,aAAa,EAAE,WAAW,OAAO,CAAC,EAAE,wBAAwB,CAAC,CAAA;YAC/E,SAAQ;QACT,CAAC;QACD,IAAI,CAAC,UAAU,EAAE,CAAC,aAAa,EAAE,WAAW,OAAO,CAAC,EAAE,EAAE,CAAC,CAAA;QACzD,MAAM,CAAC,GAAG,MAAM,UAAU,CAAC;YAC1B,OAAO;YACP,cAAc;YACd,SAAS,EAAE,eAAe;YAC1B,aAAa,EAAE,IAAI,CAAC,aAAa;SACjC,CAAC,CAAA;QACF,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACpB,CAAC;IAED,yFAAyF;IACzF,8FAA8F;IAC9F,0FAA0F;IAC1F,0FAA0F;IAC1F,0EAA0E;IAC1E,MAAM,YAAY,GAA8B;QAC/C,KAAK,EAAE,IAAI,CAAC,eAAe,EAAE,qBAAqB,CAAC;QACnD,GAAG,EAAE,IAAI,CAAC,eAAe,EAAE,mBAAmB,CAAC;QAC/C,IAAI,EAAE,IAAI,CAAC,eAAe,EAAE,oBAAoB,CAAC;KACjD,CAAA;IACD,MAAM,cAAc,
GAAmC;QACtD,KAAK,EAAE,iBAAiB,CAAC,YAAY,CAAC,KAAK,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC;QAClE,GAAG,EAAE,iBAAiB,CAAC,YAAY,CAAC,GAAG,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC;QAC9D,IAAI,EAAE,iBAAiB,CAAC,YAAY,CAAC,IAAI,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC;KAChE,CAAA;IACD,MAAM,cAAc,GAAG,IAAI,CAAC,eAAe,EAAE,kBAAkB,CAAC,CAAA;IAChE,MAAM,gBAAgB,GAAG,iBAAiB,CAAC,cAAc,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;IAEhF,IAAI,OAAO,GAAG,CAAC,CAAA;IACf,IAAI,WAAW,GAAG,CAAC,CAAA;IACnB,MAAM,MAAM,GAA8B,EAAE,KAAK,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,CAAA;IACvE,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAA;IAElC,MAAM,eAAe,GAAG,CAAC,GAAiB,EAAE,MAAc,EAAQ,EAAE;QACnE,gBAAgB,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,CAAC,IAAI,CAAC,CAAA;IAC/D,CAAC,CAAA;IAED,KAAK,MAAM,UAAU,IAAI,WAAW,EAAE,CAAC;QACtC,IAAI,CAAC,UAAU,EAAE,CAAC,OAAO,EAAE,YAAY,UAAU,CAAC,UAAU,EAAE,CAAC,CAAA;QAC/D,IAAI,KAAK,EAAE,MAAM,GAAG,IAAI,WAAW,CAAe,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAC1E,MAAM,MAAM,GAAmB,CAAC,GAAG,CAAC,CAAA;YACpC,IAAI,UAAU,EAAE,CAAC;gBAChB,KAAK,MAAM,GAAG,IAAI,aAAa,CAAC,GAAG,EAAE,8BAA8B,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,EAAE,CAAC;oBACnF,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;gBACjB,CAAC;YACF,CAAC;YACD,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;gBACxB,MAAM,MAAM,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAA;gBAC1B,IAAI,MAAM,CAAC,IAAI,KAAK,SAAS,EAAE,CAAC;oBAC/B,MAAM,KAAK,GAAG,WAAW,CAAC,MAAM,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAA;oBAC/C,cAAc,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;oBAC9D,MAAM,CAAC,KAAK,CAAC,EAAE,CAAA;oBACf,OAAO,EAAE,CAAA;gBACV,CAAC;qBAAM,CAAC;oBACP,eAAe,CAAC,CAAC,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;oBACrC,WAAW,EAAE,CAAA;gBACd,CAAC;YACF,CAAC;QACF,CAAC;IACF,CAAC;IAED,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC;QAAE,CAAC,CAAC,GAAG,EAAE,CAAA;IACtD,gBAAgB,CAAC,GAAG,EAAE,CAAA;IACtB,MAAM,OAAO,CAAC,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,SAAS,CAAC,gBAAgB,CAAC,CAAC,CAAC,CAAA;IAEjG,sFAAsF;IACtF,uFAAuF;IACvF,yEAAyE;IACzE,IAAI,CAAC,UAAU,EAAE,CAAC,OAAO,E
AAE,aAAa,OAAO,eAAe,CAAC,CAAA;IAC/D,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAA;IAChD,MAAM,WAAW,GAAG,MAAM,mCAAmC,CAAC;QAC7D,YAAY;QACZ,SAAS,EAAE,SAAS;QACpB,aAAa,EAAE,IAAI,CAAC,aAAa;QACjC,MAAM;QACN,QAAQ;KACR,CAAC,CAAA;IAEF,4FAA4F;IAC5F,0FAA0F;IAC1F,IAAI,CAAC,UAAU,EAAE,CAAC,OAAO,EAAE,wBAAwB,CAAC,CAAA;IACpD,MAAM,aAAa,GAAG,MAAM,WAAW,CACtC;QACC,KAAK,EAAE,WAAW,CAAa,YAAY,CAAC,KAAK,CAAC;QAClD,GAAG,EAAE,WAAW,CAAa,YAAY,CAAC,GAAG,CAAC;QAC9C,IAAI,EAAE,WAAW,CAAa,YAAY,CAAC,IAAI,CAAC;KAChD,EACD;QACC,SAAS,EAAE,IAAI,CAAC,SAAS;QACzB,aAAa,EAAE,IAAI,CAAC,aAAa;QACjC,YAAY;KACZ,CACD,CAAA;IAED,yBAAyB;IACzB,IAAI,CAAC,UAAU,EAAE,CAAC,UAAU,EAAE,iCAAiC,CAAC,CAAA;IAChE,MAAM,QAAQ,GAAwB;QACrC,cAAc,EAAE,IAAI,CAAC,aAAa;QAClC,QAAQ;QACR,QAAQ,EAAE,WAAW;QACrB,gBAAgB,EAAE,OAAO;QACzB,MAAM,EAAE,EAAE,MAAM,EAAE,WAAW,EAAE,QAAQ,EAAE;QACzC,MAAM,EAAE,EAAE,MAAM,EAAE,aAAa,CAAC,MAAM,EAAE,UAAU,EAAE,aAAa,CAAC,UAAU,EAAE;QAC9E,gBAAgB,EAAE,WAAW;QAC7B,kBAAkB,EAAE,OAAO;KAC3B,CAAA;IACD,MAAM,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,eAAe,CAAC,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;IACxG,OAAO,QAAQ,CAAA;AAChB,CAAC;AAED,KAAK,SAAS,CAAC,CAAC,WAAW,CAAI,IAAY;IAC1C,MAAM,MAAM,GAAG,gBAAgB,CAAC,IAAI,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;IAC3D,MAAM,EAAE,GAAG,eAAe,CAAC,EAAE,KAAK,EAAE,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAA;IAClE,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,EAAE,EAAE,CAAC;QAC7B,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAA;QAC3B,IAAI,CAAC,OAAO;YAAE,SAAQ;QACtB,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAM,CAAA;IAC/B,CAAC;AACF,CAAC;AAED,SAAS,SAAS,CAAC,CAAc;IAChC,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACtC,CAAC,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,CAAA;QACxB,CAAC,CAAC,IAAI,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;IACxB,CAAC,CAAC,CAAA;AACH,CAAC"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* US Census FIPS state codes → two-letter abbreviation + full display name.
|
|
7
|
+
*
|
|
8
|
+
* Covers all 50 states + DC + the five primary territories (PR, GU, VI, MP, AS).
|
|
9
|
+
*
|
|
10
|
+
* The data is public-domain US Census reference (FIPS PUB 5-2, withdrawn but still the canonical
|
|
11
|
+
* numeric ID used by every TIGER product). It is reproduced here so the TIGER adapter can resolve
|
|
12
|
+
* `statefp` columns (e.g. `"50"`) into a `region` component (e.g. `"VT"`) without an extra DB
|
|
13
|
+
* join.
|
|
14
|
+
*
|
|
15
|
+
* Salvaged 2026-05-17 from `isp-nexus/universe@6eeb7bd99643a6d62a8b8abbd50968a1e492b90b`
|
|
16
|
+
* `tiger/state.ts` (originally copyright OpenISP, Inc.; both projects are AGPL-3.0). isp-nexus
|
|
17
|
+
* ships these as TypeScript enums + a TypeORM-backed service layer; mailwoman only needs the flat
|
|
18
|
+
* lookup so the file is a plain `Record` plus a small helper.
|
|
19
|
+
*/
|
|
20
|
+
/** Per-state record: two-letter postal abbreviation + full canonical display name. */
|
|
21
|
+
export interface UsStateInfo {
|
|
22
|
+
/** Two-letter postal abbreviation, e.g. `"VT"`. */
abbreviation: string;
|
|
23
|
+
/** Full display name, e.g. `"Vermont"`. */
name: string;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* FIPS state-or-territory code → `{ abbreviation, name }`. Includes all 50 states, DC, and the five
|
|
27
|
+
* primary territories (PR, GU, VI, MP, AS). Codes are two-digit zero-padded strings to match TIGER
|
|
28
|
+
* column `statefp`.
|
|
29
|
+
*/
|
|
30
|
+
export declare const US_FIPS_STATE: Readonly<Record<string, UsStateInfo>>;
|
|
31
|
+
/** Lookup helper. Returns null when the FIPS code isn't recognized. */
|
|
32
|
+
export declare function lookupFipsState(statefp: string | null | undefined): UsStateInfo | null;
|
|
33
|
+
/**
|
|
34
|
+
* Inverted view: two-letter postal abbreviation → `UsStateInfo`. Built once at module load. Used by
|
|
35
|
+
* adapters whose source data ships the abbreviation rather than the FIPS code (FCC BDC, most
|
|
36
|
+
* federal CSVs).
|
|
37
|
+
*/
|
|
38
|
+
export declare const US_STATE_BY_ABBREVIATION: Readonly<Record<string, UsStateInfo>>;
|
|
39
|
+
/**
|
|
40
|
+
* Lookup helper for adapters carrying 2-char USPS abbreviations (`"CA"`, `"VT"`). Case-folded; null
|
|
41
|
+
* for any value outside the 50 states + DC + the five primary territories.
|
|
42
|
+
*/
|
|
43
|
+
export declare function lookupStateAbbreviation(abbreviation: string | null | undefined): UsStateInfo | null;
|
|
44
|
+
//# sourceMappingURL=us-fips-state.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"us-fips-state.d.ts","sourceRoot":"","sources":["../../../src/codex/us-fips-state.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,sFAAsF;AACtF,MAAM,WAAW,WAAW;IAC3B,YAAY,EAAE,MAAM,CAAA;IACpB,IAAI,EAAE,MAAM,CAAA;CACZ;AAED;;;;GAIG;AACH,eAAO,MAAM,aAAa,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,CA0D9D,CAAA;AAEF,uEAAuE;AACvE,wBAAgB,eAAe,CAAC,OAAO,EAAE,MAAM,GAAG,IAAI,GAAG,SAAS,GAAG,WAAW,GAAG,IAAI,CAGtF;AAED;;;;GAIG;AACH,eAAO,MAAM,wBAAwB,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,CAE1E,CAAA;AAED;;;GAGG;AACH,wBAAgB,uBAAuB,CAAC,YAAY,EAAE,MAAM,GAAG,IAAI,GAAG,SAAS,GAAG,WAAW,GAAG,IAAI,CAGnG"}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* US Census FIPS state codes → two-letter abbreviation + full display name.
|
|
7
|
+
*
|
|
8
|
+
* Covers all 50 states + DC + the five primary territories (PR, GU, VI, MP, AS).
|
|
9
|
+
*
|
|
10
|
+
* The data is public-domain US Census reference (FIPS PUB 5-2, withdrawn but still the canonical
|
|
11
|
+
* numeric ID used by every TIGER product). It is reproduced here so the TIGER adapter can resolve
|
|
12
|
+
* `statefp` columns (e.g. `"50"`) into a `region` component (e.g. `"VT"`) without an extra DB
|
|
13
|
+
* join.
|
|
14
|
+
*
|
|
15
|
+
* Salvaged 2026-05-17 from `isp-nexus/universe@6eeb7bd99643a6d62a8b8abbd50968a1e492b90b`
|
|
16
|
+
* `tiger/state.ts` (originally copyright OpenISP, Inc.; both projects are AGPL-3.0). isp-nexus
|
|
17
|
+
* ships these as TypeScript enums + a TypeORM-backed service layer; mailwoman only needs the flat
|
|
18
|
+
* lookup so the file is a plain `Record` plus a small helper.
|
|
19
|
+
*/
|
|
20
|
+
/**
 * FIPS state-or-territory code → `{ abbreviation, name }`. Includes all 50 states, DC, and the five
 * primary territories (PR, GU, VI, MP, AS). Codes are two-digit zero-padded strings to match TIGER
 * column `statefp`.
 *
 * Both the table and every record in it are frozen. The record objects themselves are what the
 * lookup helpers return and what the abbreviation index points at, so a caller mutating one would
 * otherwise silently corrupt the shared reference data.
 */
export const US_FIPS_STATE = Object.freeze(Object.fromEntries(Object.entries({
    "01": { abbreviation: "AL", name: "Alabama" },
    "02": { abbreviation: "AK", name: "Alaska" },
    "04": { abbreviation: "AZ", name: "Arizona" },
    "05": { abbreviation: "AR", name: "Arkansas" },
    "06": { abbreviation: "CA", name: "California" },
    "08": { abbreviation: "CO", name: "Colorado" },
    "09": { abbreviation: "CT", name: "Connecticut" },
    "10": { abbreviation: "DE", name: "Delaware" },
    "11": { abbreviation: "DC", name: "District of Columbia" },
    "12": { abbreviation: "FL", name: "Florida" },
    "13": { abbreviation: "GA", name: "Georgia" },
    "15": { abbreviation: "HI", name: "Hawaii" },
    "16": { abbreviation: "ID", name: "Idaho" },
    "17": { abbreviation: "IL", name: "Illinois" },
    "18": { abbreviation: "IN", name: "Indiana" },
    "19": { abbreviation: "IA", name: "Iowa" },
    "20": { abbreviation: "KS", name: "Kansas" },
    "21": { abbreviation: "KY", name: "Kentucky" },
    "22": { abbreviation: "LA", name: "Louisiana" },
    "23": { abbreviation: "ME", name: "Maine" },
    "24": { abbreviation: "MD", name: "Maryland" },
    "25": { abbreviation: "MA", name: "Massachusetts" },
    "26": { abbreviation: "MI", name: "Michigan" },
    "27": { abbreviation: "MN", name: "Minnesota" },
    "28": { abbreviation: "MS", name: "Mississippi" },
    "29": { abbreviation: "MO", name: "Missouri" },
    "30": { abbreviation: "MT", name: "Montana" },
    "31": { abbreviation: "NE", name: "Nebraska" },
    "32": { abbreviation: "NV", name: "Nevada" },
    "33": { abbreviation: "NH", name: "New Hampshire" },
    "34": { abbreviation: "NJ", name: "New Jersey" },
    "35": { abbreviation: "NM", name: "New Mexico" },
    "36": { abbreviation: "NY", name: "New York" },
    "37": { abbreviation: "NC", name: "North Carolina" },
    "38": { abbreviation: "ND", name: "North Dakota" },
    "39": { abbreviation: "OH", name: "Ohio" },
    "40": { abbreviation: "OK", name: "Oklahoma" },
    "41": { abbreviation: "OR", name: "Oregon" },
    "42": { abbreviation: "PA", name: "Pennsylvania" },
    "44": { abbreviation: "RI", name: "Rhode Island" },
    "45": { abbreviation: "SC", name: "South Carolina" },
    "46": { abbreviation: "SD", name: "South Dakota" },
    "47": { abbreviation: "TN", name: "Tennessee" },
    "48": { abbreviation: "TX", name: "Texas" },
    "49": { abbreviation: "UT", name: "Utah" },
    "50": { abbreviation: "VT", name: "Vermont" },
    "51": { abbreviation: "VA", name: "Virginia" },
    "53": { abbreviation: "WA", name: "Washington" },
    "54": { abbreviation: "WV", name: "West Virginia" },
    "55": { abbreviation: "WI", name: "Wisconsin" },
    "56": { abbreviation: "WY", name: "Wyoming" },
    // Territories
    "60": { abbreviation: "AS", name: "American Samoa" },
    "66": { abbreviation: "GU", name: "Guam" },
    "69": { abbreviation: "MP", name: "Northern Mariana Islands" },
    "72": { abbreviation: "PR", name: "Puerto Rico" },
    "78": { abbreviation: "VI", name: "Virgin Islands" },
    // Object.freeze is shallow, so each record is frozen individually as well.
}).map(([statefp, info]) => [statefp, Object.freeze(info)])));
|
|
84
|
+
/**
 * Lookup helper. Returns null when the FIPS code isn't recognized.
 *
 * @param {string | null | undefined} statefp - FIPS code as it appears in the table, i.e. a
 *   two-digit zero-padded string such as `"50"`.
 * @returns {{ abbreviation: string, name: string } | null} The matching record, or null for a
 *   missing, empty, or unknown code.
 */
export function lookupFipsState(statefp) {
    if (!statefp)
        return null;
    // Own-property check: plain indexing would also find inherited members, so inputs such as
    // "constructor" or "__proto__" would return a non-null value that is not a state record.
    return Object.prototype.hasOwnProperty.call(US_FIPS_STATE, statefp) ? US_FIPS_STATE[statefp] : null;
}
|
|
90
|
+
/**
 * Abbreviation-keyed index over `US_FIPS_STATE`: two-letter postal abbreviation → the same record
 * object the FIPS table holds. Computed a single time when the module is evaluated, for adapters
 * whose source data carries the abbreviation instead of the FIPS code (FCC BDC, most federal
 * CSVs).
 */
export const US_STATE_BY_ABBREVIATION = Object.freeze(
    Object.values(US_FIPS_STATE).reduce((byAbbreviation, info) => {
        byAbbreviation[info.abbreviation] = info;
        return byAbbreviation;
    }, {}),
);
|
|
96
|
+
/**
 * Lookup helper for adapters carrying 2-char USPS abbreviations (`"CA"`, `"VT"`). The input is
 * trimmed and case-folded before the lookup, so `" vt "` resolves the same as `"VT"`; null for any
 * value outside the 50 states + DC + the five primary territories.
 *
 * @param {string | null | undefined} abbreviation - Postal abbreviation in any letter case,
 *   optionally padded with whitespace.
 * @returns {{ abbreviation: string, name: string } | null} The matching record, or null for a
 *   missing, blank, or unknown abbreviation.
 */
export function lookupStateAbbreviation(abbreviation) {
    if (!abbreviation)
        return null;
    // Padded fields ("VT ") are common in fixed-width and CSV exports; normalise before matching.
    const key = abbreviation.trim().toUpperCase();
    // Own-property check so only real table entries can ever be returned.
    return Object.prototype.hasOwnProperty.call(US_STATE_BY_ABBREVIATION, key) ? US_STATE_BY_ABBREVIATION[key] : null;
}
|
|
105
|
+
//# sourceMappingURL=us-fips-state.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"us-fips-state.js","sourceRoot":"","sources":["../../../src/codex/us-fips-state.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAQH;;;;GAIG;AACH,MAAM,CAAC,MAAM,aAAa,GAA0C,MAAM,CAAC,MAAM,CAAC;IACjF,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE;IAC7C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE;IAC5C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE;IAC7C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE;IAC9C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,YAAY,EAAE;IAChD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE;IAC9C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,aAAa,EAAE;IACjD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE;IAC9C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,sBAAsB,EAAE;IAC1D,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE;IAC7C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE;IAC7C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE;IAC5C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE;IAC3C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE;IAC9C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE;IAC7C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE;IAC1C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE;IAC5C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE;IAC9C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,WAAW,EAAE;IAC/C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE;IAC3C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE;IAC9C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,eAAe,EAAE;IACnD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE;IAC9C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,WAAW,EAAE;IAC/C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,aAAa,EAAE;IACjD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE;IAC9C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE;IAC7C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE;IAC9C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE;IAC5C,IAA
I,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,eAAe,EAAE;IACnD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,YAAY,EAAE;IAChD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,YAAY,EAAE;IAChD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE;IAC9C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,gBAAgB,EAAE;IACpD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,cAAc,EAAE;IAClD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE;IAC1C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE;IAC9C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE;IAC5C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,cAAc,EAAE;IAClD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,cAAc,EAAE;IAClD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,gBAAgB,EAAE;IACpD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,cAAc,EAAE;IAClD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,WAAW,EAAE;IAC/C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE;IAC3C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE;IAC1C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE;IAC7C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE;IAC9C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,YAAY,EAAE;IAChD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,eAAe,EAAE;IACnD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,WAAW,EAAE;IAC/C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE;IAC7C,cAAc;IACd,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,gBAAgB,EAAE;IACpD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE;IAC1C,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,0BAA0B,EAAE;IAC9D,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,aAAa,EAAE;IACjD,IAAI,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,IAAI,EAAE,gBAAgB,EAAE;CACpD,CAAC,CAAA;AAEF,uEAAuE;AACvE,MAAM,UAAU,eAAe,CAAC,OAAkC;IACjE,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IACzB,OAAO,aAAa,CAAC,OAAO,CAAC,IAAI,IAAI,CAAA;AACtC,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAA0C,MAAM,CAAC,MAAM,CAC3F,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,YAAY,EAAE,IA
AI,CAAC,CAAC,CAAC,CACzF,CAAA;AAED;;;GAGG;AACH,MAAM,UAAU,uBAAuB,CAAC,YAAuC;IAC9E,IAAI,CAAC,YAAY;QAAE,OAAO,IAAI,CAAA;IAC9B,OAAO,wBAAwB,CAAC,YAAY,CAAC,WAAW,EAAE,CAAC,IAAI,IAAI,CAAA;AACpE,CAAC"}
|