@mailwoman/corpus 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/src/adapter.d.ts +96 -0
- package/out/src/adapter.d.ts.map +1 -0
- package/out/src/adapter.js +107 -0
- package/out/src/adapter.js.map +1 -0
- package/out/src/adapters/ban/adapter.d.ts +32 -0
- package/out/src/adapters/ban/adapter.d.ts.map +1 -0
- package/out/src/adapters/ban/adapter.js +133 -0
- package/out/src/adapters/ban/adapter.js.map +1 -0
- package/out/src/adapters/fcc-bdc/adapter.d.ts +61 -0
- package/out/src/adapters/fcc-bdc/adapter.d.ts.map +1 -0
- package/out/src/adapters/fcc-bdc/adapter.js +153 -0
- package/out/src/adapters/fcc-bdc/adapter.js.map +1 -0
- package/out/src/adapters/index.d.ts +42 -0
- package/out/src/adapters/index.d.ts.map +1 -0
- package/out/src/adapters/index.js +76 -0
- package/out/src/adapters/index.js.map +1 -0
- package/out/src/adapters/openaddresses/adapter.d.ts +60 -0
- package/out/src/adapters/openaddresses/adapter.d.ts.map +1 -0
- package/out/src/adapters/openaddresses/adapter.js +174 -0
- package/out/src/adapters/openaddresses/adapter.js.map +1 -0
- package/out/src/adapters/state-ia-contractors/adapter.d.ts +23 -0
- package/out/src/adapters/state-ia-contractors/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-ia-contractors/adapter.js +113 -0
- package/out/src/adapters/state-ia-contractors/adapter.js.map +1 -0
- package/out/src/adapters/state-ny-notaries/adapter.d.ts +21 -0
- package/out/src/adapters/state-ny-notaries/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-ny-notaries/adapter.js +132 -0
- package/out/src/adapters/state-ny-notaries/adapter.js.map +1 -0
- package/out/src/adapters/state-tx-notaries/adapter.d.ts +22 -0
- package/out/src/adapters/state-tx-notaries/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-tx-notaries/adapter.js +125 -0
- package/out/src/adapters/state-tx-notaries/adapter.js.map +1 -0
- package/out/src/adapters/tiger/adapter.d.ts +45 -0
- package/out/src/adapters/tiger/adapter.d.ts.map +1 -0
- package/out/src/adapters/tiger/adapter.js +179 -0
- package/out/src/adapters/tiger/adapter.js.map +1 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts +36 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.js +147 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.js.map +1 -0
- package/out/src/adapters/usgov-imls-pls/adapter.d.ts +25 -0
- package/out/src/adapters/usgov-imls-pls/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-imls-pls/adapter.js +118 -0
- package/out/src/adapters/usgov-imls-pls/adapter.js.map +1 -0
- package/out/src/adapters/usgov-nad/adapter.d.ts +37 -0
- package/out/src/adapters/usgov-nad/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-nad/adapter.js +227 -0
- package/out/src/adapters/usgov-nad/adapter.js.map +1 -0
- package/out/src/adapters/usgov-nppes/adapter.d.ts +28 -0
- package/out/src/adapters/usgov-nppes/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-nppes/adapter.js +123 -0
- package/out/src/adapters/usgov-nppes/adapter.js.map +1 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts +35 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js +162 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js.map +1 -0
- package/out/src/adapters/wof-admin-json/adapter.d.ts +85 -0
- package/out/src/adapters/wof-admin-json/adapter.d.ts.map +1 -0
- package/out/src/adapters/wof-admin-json/adapter.js +241 -0
- package/out/src/adapters/wof-admin-json/adapter.js.map +1 -0
- package/out/src/adapters/wof-postalcode-json/adapter.d.ts +63 -0
- package/out/src/adapters/wof-postalcode-json/adapter.d.ts.map +1 -0
- package/out/src/adapters/wof-postalcode-json/adapter.js +178 -0
- package/out/src/adapters/wof-postalcode-json/adapter.js.map +1 -0
- package/out/src/align.d.ts +58 -0
- package/out/src/align.d.ts.map +1 -0
- package/out/src/align.js +139 -0
- package/out/src/align.js.map +1 -0
- package/out/src/build.d.ts +104 -0
- package/out/src/build.d.ts.map +1 -0
- package/out/src/build.js +201 -0
- package/out/src/build.js.map +1 -0
- package/out/src/codex/us-fips-state.d.ts +44 -0
- package/out/src/codex/us-fips-state.d.ts.map +1 -0
- package/out/src/codex/us-fips-state.js +105 -0
- package/out/src/codex/us-fips-state.js.map +1 -0
- package/out/src/codex/us-street-suffix.d.ts +259 -0
- package/out/src/codex/us-street-suffix.d.ts.map +1 -0
- package/out/src/codex/us-street-suffix.js +285 -0
- package/out/src/codex/us-street-suffix.js.map +1 -0
- package/out/src/format.d.ts +79 -0
- package/out/src/format.d.ts.map +1 -0
- package/out/src/format.js +151 -0
- package/out/src/format.js.map +1 -0
- package/out/src/golden.d.ts +50 -0
- package/out/src/golden.d.ts.map +1 -0
- package/out/src/golden.js +104 -0
- package/out/src/golden.js.map +1 -0
- package/out/src/index.d.ts +18 -0
- package/out/src/index.d.ts.map +1 -0
- package/out/src/index.js +18 -0
- package/out/src/index.js.map +1 -0
- package/out/src/parquet-wrapper/index.d.ts +12 -0
- package/out/src/parquet-wrapper/index.d.ts.map +1 -0
- package/out/src/parquet-wrapper/index.js +12 -0
- package/out/src/parquet-wrapper/index.js.map +1 -0
- package/out/src/parquet-wrapper/reader.d.ts +31 -0
- package/out/src/parquet-wrapper/reader.d.ts.map +1 -0
- package/out/src/parquet-wrapper/reader.js +54 -0
- package/out/src/parquet-wrapper/reader.js.map +1 -0
- package/out/src/parquet-wrapper/schema.d.ts +45 -0
- package/out/src/parquet-wrapper/schema.d.ts.map +1 -0
- package/out/src/parquet-wrapper/schema.js +55 -0
- package/out/src/parquet-wrapper/schema.js.map +1 -0
- package/out/src/parquet-wrapper/writer.d.ts +41 -0
- package/out/src/parquet-wrapper/writer.d.ts.map +1 -0
- package/out/src/parquet-wrapper/writer.js +71 -0
- package/out/src/parquet-wrapper/writer.js.map +1 -0
- package/out/src/parquet.d.ts +122 -0
- package/out/src/parquet.d.ts.map +1 -0
- package/out/src/parquet.js +220 -0
- package/out/src/parquet.js.map +1 -0
- package/out/src/runner.d.ts +100 -0
- package/out/src/runner.d.ts.map +1 -0
- package/out/src/runner.js +183 -0
- package/out/src/runner.js.map +1 -0
- package/out/src/split.d.ts +108 -0
- package/out/src/split.d.ts.map +1 -0
- package/out/src/split.js +191 -0
- package/out/src/split.js.map +1 -0
- package/out/src/synthesize.d.ts +146 -0
- package/out/src/synthesize.d.ts.map +1 -0
- package/out/src/synthesize.js +472 -0
- package/out/src/synthesize.js.map +1 -0
- package/out/src/tokenize.d.ts +47 -0
- package/out/src/tokenize.d.ts.map +1 -0
- package/out/src/tokenize.js +49 -0
- package/out/src/tokenize.js.map +1 -0
- package/out/src/types.d.ts +168 -0
- package/out/src/types.d.ts.map +1 -0
- package/out/src/types.js +19 -0
- package/out/src/types.js.map +1 -0
- package/out/src/wof-json.d.ts +105 -0
- package/out/src/wof-json.d.ts.map +1 -0
- package/out/src/wof-json.js +174 -0
- package/out/src/wof-json.js.map +1 -0
- package/package.json +36 -0
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Adapter runner — drives a `CorpusAdapter` to completion and writes intermediate JSONL + a
|
|
7
|
+
* per-shard manifest.
|
|
8
|
+
*
|
|
9
|
+
* Output layout under `outputDir`:
|
|
10
|
+
*
|
|
11
|
+
* ```
|
|
12
|
+
* <outputDir>/<adapter.id>/
|
|
13
|
+
* canonical.jsonl # one row per line, in emission order
|
|
14
|
+
* MANIFEST.json # adapter id, version, row count, sha256, license, started_at, ended_at
|
|
15
|
+
* ```
|
|
16
|
+
*
|
|
17
|
+
* The runner is responsible for everything an adapter is **not** responsible for:
|
|
18
|
+
*
|
|
19
|
+
* - Stamping `corpus_version` on every row (adapters must NOT set it).
|
|
20
|
+
* - Applying `canonicalDedupKey` and skipping duplicates.
|
|
21
|
+
* - Streaming sha256 over JSONL bytes so the manifest checksum doesn't require a re-read.
|
|
22
|
+
* - Honoring backpressure on the output write stream.
|
|
23
|
+
* - Counting + emitting periodic progress to an optional callback.
|
|
24
|
+
* - Honoring `signal` (delegates to adapter's iteration boundary).
|
|
25
|
+
*
|
|
26
|
+
* The runner does NOT perform alignment, tokenization, synthesis, or sharding into Parquet. Those
|
|
27
|
+
* steps run later, consuming the JSONL shards this writes.
|
|
28
|
+
*/
|
|
29
|
+
import { createWriteStream } from "node:fs";
|
|
30
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
31
|
+
import { dirname, join } from "node:path";
|
|
32
|
+
import { canonicalDedupKey, streamingSha256 } from "./adapter.js";
|
|
33
|
+
/**
 * Drive a single adapter to completion.
 *
 * Creates `outputDir/<adapter.id>/`, streams every accepted row into `canonical.jsonl` (one JSON
 * document per line, in emission order) and, once the stream has closed cleanly, writes
 * `MANIFEST.json` next to it.
 *
 * Per row, the runner:
 *
 * - Checks the abort signal (if any) and validates the row with `assertEmittedRow`.
 * - Stamps `corpus_version` onto a copy of the row.
 * - Skips the row when its `canonicalDedupKey` was already seen. Dedup tracking stops once the
 *   in-memory set reaches `DEDUP_MAX_SIZE` keys; later rows are written without a dedup check.
 * - Feeds the serialized line to the streaming hasher and the write stream, waiting for `drain`
 *   whenever `write()` reports backpressure.
 *
 * @param opts Run options: `adapter`, `adapterOptions`, `outputDir`, `corpusVersion`, and the
 *   optional `progressEvery` (defaults to 1000 yielded rows) and `onProgress` callback.
 * @returns The manifest object that was written to `MANIFEST.json`.
 * @throws If the output directory cannot be created, if a row fails validation, if the abort
 *   signal fires (`AbortError`), or if the output stream reports an error. No manifest is written
 *   in any of those cases.
 */
export async function runAdapter(opts) {
    const { adapter, adapterOptions, outputDir, corpusVersion } = opts;
    const progressEvery = opts.progressEvery ?? 1_000;
    const adapterDir = join(outputDir, adapter.id);
    await mkdir(adapterDir, { recursive: true });
    const jsonlPath = join(adapterDir, "canonical.jsonl");
    const manifestPath = join(adapterDir, "MANIFEST.json");
    const startedAt = new Date();
    const t0 = performance.now();
    const stream = createWriteStream(jsonlPath, { encoding: "utf8" });
    // Keep an `error` listener attached for the stream's whole lifetime. Without it, a failure that
    // surfaces while we are not awaiting `drain` (failed open, asynchronous write error) would be
    // raised as an unhandled 'error' event instead of failing this run.
    let streamError;
    stream.on("error", (err) => {
        if (streamError === undefined)
            streamError = err;
    });
    // Subscribe to `close` up front. A stream that fails emits `error` and then `close` back to
    // back, so a listener attached only in `finally` would miss it and wait forever.
    const closed = new Promise((resolve) => {
        stream.once("close", resolve);
    });
    const hasher = streamingSha256();
    const seen = new Set();
    const DEDUP_MAX_SIZE = 10_000_000;
    let dedupExhausted = false;
    let yielded = 0;
    let written = 0;
    let bytes = 0;
    const emitProgress = () => {
        opts.onProgress?.({
            adapterId: adapter.id,
            yielded,
            written,
            bytes,
            elapsed_ms: performance.now() - t0,
        });
    };
    try {
        for await (const row of adapter.rows(adapterOptions)) {
            // Stop as soon as the output stream is known to be broken.
            if (streamError !== undefined) {
                throw streamError;
            }
            // `adapterOptions` may be undefined when the caller supplied no options at all.
            if (adapterOptions?.signal?.aborted) {
                throw new DOMException("Adapter run aborted by signal", "AbortError");
            }
            yielded++;
            assertEmittedRow(adapter, row);
            const stamped = { ...row, corpus_version: corpusVersion };
            const key = canonicalDedupKey(stamped);
            if (!dedupExhausted) {
                if (seen.has(key)) {
                    // Duplicates still count toward `yielded`, so they can trigger a progress tick.
                    if (yielded % progressEvery === 0)
                        emitProgress();
                    continue;
                }
                if (seen.size >= DEDUP_MAX_SIZE) {
                    // Cap the set's memory; from here on rows are written without dedup.
                    dedupExhausted = true;
                    process.stderr.write(`  runner: dedup set full at ${DEDUP_MAX_SIZE.toLocaleString()} — skipping dedup for remaining rows\n`);
                }
                else {
                    seen.add(key);
                }
            }
            const line = `${JSON.stringify(stamped)}\n`;
            hasher.update(line);
            bytes += Buffer.byteLength(line, "utf8");
            written++;
            if (!stream.write(line)) {
                // Backpressure: wait for the buffer to flush (rejects if the stream errors meanwhile).
                await once(stream, "drain");
            }
            if (yielded % progressEvery === 0)
                emitProgress();
        }
    }
    finally {
        // `end()` is harmless on an already-destroyed stream; `closed` settles in either case.
        stream.end();
        await closed;
    }
    // An error during the final flush is only visible here; never describe a truncated file.
    if (streamError !== undefined) {
        throw streamError;
    }
    const endedAt = new Date();
    const elapsed_ms = performance.now() - t0;
    emitProgress();
    const manifest = {
        adapter_id: adapter.id,
        corpus_version: corpusVersion,
        default_license: adapter.defaultLicense,
        description: adapter.description,
        yielded,
        written,
        deduped: yielded - written,
        bytes,
        sha256: hasher.digest(),
        jsonl_path: jsonlPath,
        started_at: startedAt.toISOString(),
        ended_at: endedAt.toISOString(),
        elapsed_ms,
    };
    await writeFile(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`, "utf8");
    return manifest;
}
|
|
125
|
+
/**
 * Run every adapter returned by `registry.list()`, one after another, via `runAdapter`.
 *
 * Each adapter receives `common.adapterOptionsFor(adapter)` when that hook exists and returns a
 * non-nullish value, otherwise `common.adapterOptions`. The first rejected run aborts the loop and
 * propagates to the caller; filter the registry beforehand if partial failure should be tolerated.
 *
 * @returns The manifests, in the order the registry listed the adapters.
 */
export async function runAllAdapters(registry, common) {
    const manifests = [];
    for (const adapter of registry.list()) {
        const perAdapterOptions = common.adapterOptionsFor?.(adapter);
        const adapterOptions = perAdapterOptions ?? common.adapterOptions;
        const manifest = await runAdapter({ ...common, adapter, adapterOptions });
        manifests.push(manifest);
    }
    return manifests;
}
|
|
143
|
+
/**
 * Sanity-check one emitted row before it is written, so adapter bugs fail fast instead of
 * producing a half-malformed JSONL file.
 *
 * Checks, in order: `row.source` equals `adapter.id`; then `source_id`, `raw`, `country` and
 * `license` are all truthy. Throws an `Error` naming the adapter on the first failed check.
 */
function assertEmittedRow(adapter, row) {
    if (row.source !== adapter.id) {
        const got = JSON.stringify(row.source);
        throw new Error(`adapter ${adapter.id}: row.source must equal adapter.id (got ${got})`);
    }
    if (!row.source_id) {
        throw new Error(`adapter ${adapter.id}: row.source_id is empty`);
    }
    // The remaining required fields share one message shape that includes the row's source_id.
    for (const field of ["raw", "country", "license"]) {
        if (row[field])
            continue;
        throw new Error(`adapter ${adapter.id}: row.${field} is empty for source_id=${row.source_id}`);
    }
}
|
|
164
|
+
/**
 * Turn the next emission of `event` into a promise. Used to await `drain` / `close` on the write
 * stream. Resolves (with no value) when `event` fires; rejects with the emitted error if `error`
 * fires first. Whichever listener runs removes the other one, so nothing is left attached.
 */
function once(emitter, event) {
    return new Promise((resolve, reject) => {
        function handleEvent() {
            emitter.off("error", handleError);
            resolve(undefined);
        }
        function handleError(err) {
            emitter.off(event, handleEvent);
            reject(err);
        }
        emitter.once(event, handleEvent);
        emitter.once("error", handleError);
    });
}
|
|
179
|
+
/** Convenience: create the directory that would contain `filePath`, including missing ancestors. */
export async function ensureParentDir(filePath) {
    const parentDir = dirname(filePath);
    await mkdir(parentDir, { recursive: true });
}
|
|
183
|
+
//# sourceMappingURL=runner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AAEH,OAAO,EAAE,iBAAiB,EAAoB,MAAM,SAAS,CAAA;AAC7D,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAA;AACnD,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAA;AACzC,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAA8C,MAAM,cAAc,CAAA;AAiE7G;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,IAAuB;IACvD,MAAM,EAAE,OAAO,EAAE,cAAc,EAAE,SAAS,EAAE,aAAa,EAAE,GAAG,IAAI,CAAA;IAClE,MAAM,aAAa,GAAG,IAAI,CAAC,aAAa,IAAI,KAAK,CAAA;IAEjD,MAAM,UAAU,GAAG,IAAI,CAAC,SAAS,EAAE,OAAO,CAAC,EAAE,CAAC,CAAA;IAC9C,MAAM,KAAK,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;IAE5C,MAAM,SAAS,GAAG,IAAI,CAAC,UAAU,EAAE,iBAAiB,CAAC,CAAA;IACrD,MAAM,YAAY,GAAG,IAAI,CAAC,UAAU,EAAE,eAAe,CAAC,CAAA;IAEtD,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAA;IAC5B,MAAM,EAAE,GAAG,WAAW,CAAC,GAAG,EAAE,CAAA;IAE5B,MAAM,MAAM,GAAG,iBAAiB,CAAC,SAAS,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;IACjE,MAAM,MAAM,GAAoB,eAAe,EAAE,CAAA;IACjD,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAA;IAC9B,MAAM,cAAc,GAAG,UAAU,CAAA;IACjC,IAAI,cAAc,GAAG,KAAK,CAAA;IAE1B,IAAI,OAAO,GAAG,CAAC,CAAA;IACf,IAAI,OAAO,GAAG,CAAC,CAAA;IACf,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,MAAM,YAAY,GAAG,GAAS,EAAE;QAC/B,IAAI,CAAC,UAAU,EAAE,CAAC;YACjB,SAAS,EAAE,OAAO,CAAC,EAAE;YACrB,OAAO;YACP,OAAO;YACP,KAAK;YACL,UAAU,EAAE,WAAW,CAAC,GAAG,EAAE,GAAG,EAAE;SAClC,CAAC,CAAA;IACH,CAAC,CAAA;IAED,IAAI,CAAC;QACJ,IAAI,KAAK,EAAE,MAAM,GAAG,IAAI,OAAO,CAAC,IAAI,CAAC,cAAc,CAAC,EAAE,CAAC;YACtD,IAAI,cAAc,CAAC,MAAM,EAAE,OAAO,EAAE,CAAC;gBACpC,MAAM,IAAI,YAAY,CAAC,+BAA+B,EAAE,YAAY,CAAC,CAAA;YACtE,CAAC;YAED,OAAO,EAAE,CAAA;YACT,gBAAgB,CAAC,OAAO,EAAE,GAAG,CAAC,CAAA;YAE9B,MAAM,OAAO,GAAiB,EAAE,GAAG,GAAG,EAAE,cAAc,EAAE,aAAa,EAAE,CAAA;YACvE,MAAM,GAAG,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAA;YACtC,IAAI,CAAC,cAAc,EAAE,CAAC;gBACrB,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;oBACnB,IAAI,OAAO,GAAG,aAAa,KAAK,CAAC;wBAAE,YAAY,EAAE,CAAA;oBACjD,SAAQ;gBACT,CAAC;gBACD,IAAI,IAAI,CAAC,IAAI,IAAI,cAAc,EAAE,CAAC;oBACjC,cAAc,GAAG,IAAI,CAAA;oBACrB,OAAO,CAAC,MAAM
,CAAC,KAAK,CACnB,+BAA+B,cAAc,CAAC,cAAc,EAAE,wCAAwC,CACtG,CAAA;gBACF,CAAC;qBAAM,CAAC;oBACP,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;gBACd,CAAC;YACF,CAAC;YAED,MAAM,IAAI,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,IAAI,CAAA;YAC3C,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;YACnB,KAAK,IAAI,MAAM,CAAC,UAAU,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;YACxC,OAAO,EAAE,CAAA;YAET,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;gBACzB,MAAM,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;YAC5B,CAAC;YAED,IAAI,OAAO,GAAG,aAAa,KAAK,CAAC;gBAAE,YAAY,EAAE,CAAA;QAClD,CAAC;IACF,CAAC;YAAS,CAAC;QACV,MAAM,CAAC,GAAG,EAAE,CAAA;QACZ,MAAM,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IAC5B,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,IAAI,EAAE,CAAA;IAC1B,MAAM,UAAU,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,EAAE,CAAA;IACzC,YAAY,EAAE,CAAA;IAEd,MAAM,QAAQ,GAAuB;QACpC,UAAU,EAAE,OAAO,CAAC,EAAE;QACtB,cAAc,EAAE,aAAa;QAC7B,eAAe,EAAE,OAAO,CAAC,cAAc;QACvC,WAAW,EAAE,OAAO,CAAC,WAAW;QAChC,OAAO;QACP,OAAO;QACP,OAAO,EAAE,OAAO,GAAG,OAAO;QAC1B,KAAK;QACL,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE;QACvB,UAAU,EAAE,SAAS;QACrB,UAAU,EAAE,SAAS,CAAC,WAAW,EAAE;QACnC,QAAQ,EAAE,OAAO,CAAC,WAAW,EAAE;QAC/B,UAAU;KACV,CAAA;IAED,MAAM,SAAS,CAAC,YAAY,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;IAE/E,OAAO,QAAQ,CAAA;AAChB,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CACnC,QAAyB,EACzB,MAAyG;IAEzG,MAAM,GAAG,GAAyB,EAAE,CAAA;IACpC,KAAK,MAAM,OAAO,IAAI,QAAQ,CAAC,IAAI,EAAE,EAAE,CAAC;QACvC,MAAM,cAAc,GAAG,MAAM,CAAC,iBAAiB,EAAE,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,cAAc,CAAA;QACnF,GAAG,CAAC,IAAI,CACP,MAAM,UAAU,CAAC;YAChB,GAAG,MAAM;YACT,OAAO;YACP,cAAc;SACd,CAAC,CACF,CAAA;IACF,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;GAGG;AACH,SAAS,gBAAgB,CAAC,OAAsB,EAAE,GAAiB;IAClE,IAAI,GAAG,CAAC,MAAM,KAAK,OAAO,CAAC,EAAE,EAAE,CAAC;QAC/B,MAAM,IAAI,KAAK,CAAC,WAAW,OAAO,CAAC,EAAE,2CAA2C,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;IAC/G,CAAC;IACD,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,WAAW,OAAO,CAAC,EAAE,0BAA0B,CAAC,CAAA;IACjE,CAAC;IACD,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC;QACd,MAAM,IAAI,KAAK
,CAAC,WAAW,OAAO,CAAC,EAAE,oCAAoC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAA;IAC1F,CAAC;IACD,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC;QAClB,MAAM,IAAI,KAAK,CAAC,WAAW,OAAO,CAAC,EAAE,wCAAwC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAA;IAC9F,CAAC;IACD,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC;QAClB,MAAM,IAAI,KAAK,CAAC,WAAW,OAAO,CAAC,EAAE,wCAAwC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAA;IAC9F,CAAC;AACF,CAAC;AAED,gGAAgG;AAChG,SAAS,IAAI,CAAC,OAAoB,EAAE,KAAwB;IAC3D,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACtC,MAAM,OAAO,GAAG,GAAS,EAAE;YAC1B,OAAO,CAAC,GAAG,CAAC,OAAO,EAAE,OAAO,CAAC,CAAA;YAC7B,OAAO,EAAE,CAAA;QACV,CAAC,CAAA;QACD,MAAM,OAAO,GAAG,CAAC,GAAU,EAAQ,EAAE;YACpC,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,OAAO,CAAC,CAAA;YAC3B,MAAM,CAAC,GAAG,CAAC,CAAA;QACZ,CAAC,CAAA;QACD,OAAO,CAAC,IAAI,CAAC,KAAK,EAAE,OAAO,CAAC,CAAA;QAC5B,OAAO,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,CAAA;IAC/B,CAAC,CAAC,CAAA;AACH,CAAC;AAED,qEAAqE;AACrE,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,QAAgB;IACrD,MAAM,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;AACpD,CAAC"}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Train / val / test split with **locality holdout** per the Phase 1 plan.
|
|
7
|
+
*
|
|
8
|
+
* The corpus's val + test sets are not randomly sampled rows — they're entire low-density regions
|
|
9
|
+
* held out so the model cannot memorize them at training time. Rationale (per the plan's "Common
|
|
10
|
+
* pitfalls" section): random splits leak by neighborhood — a model fed "13 Main St, Springfield,
|
|
11
|
+
* IL" in train and "15 Main St, Springfield, IL" in test generalizes via region/locality
|
|
12
|
+
* memorization, not by learning the underlying schema.
|
|
13
|
+
*
|
|
14
|
+
* Phase 1 holdouts (chosen for low data density + administrative isolation):
|
|
15
|
+
*
|
|
16
|
+
* - **US**: Vermont, Wyoming, North Dakota
|
|
17
|
+
* - **FR**: Corse, Lozère, Creuse
|
|
18
|
+
*
|
|
19
|
+
* Held-out rows are deterministically split 50/50 between val and test by hashing the row's
|
|
20
|
+
* `source_id`. Non-held-out rows go to train. The 90/5/5 ratio is approximate — what matters is
|
|
21
|
+
* the locality boundary, not the exact split percentages.
|
|
22
|
+
*
|
|
23
|
+
* The output is a `SplitManifest`: three `string[]` arrays of `source_id`. Manifests live in git
|
|
24
|
+
* (under `corpus/splits/<version>/`) so reruns are reproducible bit-for-bit.
|
|
25
|
+
*/
|
|
26
|
+
import type { CanonicalRow, LabeledRow } from "./types.js";
|
|
27
|
+
export type SplitName = "train" | "val" | "test";
|
|
28
|
+
export interface SplitOptions {
|
|
29
|
+
/**
|
|
30
|
+
* Region-name → holdout policy, keyed by ISO 3166-1 alpha-2 country. The values are the
|
|
31
|
+
* region-component strings the splitter looks for in `row.components.region`. Override to change
|
|
32
|
+
* the holdout for an experiment; defaults to `defaultHoldouts()`.
|
|
33
|
+
*/
|
|
34
|
+
holdouts?: Record<string, readonly string[]>;
|
|
35
|
+
}
|
|
36
|
+
/** Output manifest: source_id lists per split. */
|
|
37
|
+
export interface SplitManifest {
|
|
38
|
+
train: string[];
|
|
39
|
+
val: string[];
|
|
40
|
+
test: string[];
|
|
41
|
+
/** Echoes the holdouts used, so the manifest is self-describing. */
|
|
42
|
+
holdouts: Record<string, readonly string[]>;
|
|
43
|
+
/** Corpus version stamped onto the manifest. Read from the first row. */
|
|
44
|
+
corpus_version: string;
|
|
45
|
+
/** Counts for quick sanity checks. */
|
|
46
|
+
counts: {
|
|
47
|
+
train: number;
|
|
48
|
+
val: number;
|
|
49
|
+
test: number;
|
|
50
|
+
total: number;
|
|
51
|
+
};
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Phase 1 default holdouts (per plan).
|
|
55
|
+
*
|
|
56
|
+
* - US: Vermont, Wyoming, North Dakota (low density, easy to identify in WOF/admin sources).
|
|
57
|
+
* - FR: Corse, Lozère, Creuse (small departments / regions).
|
|
58
|
+
*/
|
|
59
|
+
export declare function defaultHoldouts(): Record<string, readonly string[]>;
|
|
60
|
+
type SplitInputRow = Pick<CanonicalRow, "source_id" | "country" | "corpus_version" | "components">;
|
|
61
|
+
/**
|
|
62
|
+
* Pure per-row split decision. Used by both the in-memory `splitRows` and by the streaming
|
|
63
|
+
* `buildCorpus` align loop (`build.ts`) to decide each row's split without retaining the row in
|
|
64
|
+
* heap. Identical hash bucketing semantics to the array-based path so the decision is stable
|
|
65
|
+
* regardless of caller.
|
|
66
|
+
*/
|
|
67
|
+
export declare function splitForRow(row: Pick<SplitInputRow, "source_id" | "country" | "components">, holdouts?: Record<string, readonly string[]>): SplitName;
|
|
68
|
+
/**
|
|
69
|
+
* Compute a `SplitManifest` from an iterable of labeled (or canonical) rows. Both shapes are
|
|
70
|
+
* accepted — only `source_id`, `country`, `corpus_version`, and `components.region` are consulted.
|
|
71
|
+
*
|
|
72
|
+
* Retained for in-memory callers (tests; small-scale fixture runs). Real-data builds via
|
|
73
|
+
* `buildCorpus` use the streaming path (`splitForRow` + `writeSplitManifestsFromLabeledFiles`) to
|
|
74
|
+
* avoid materializing every aligned row's split membership in heap.
|
|
75
|
+
*/
|
|
76
|
+
export declare function splitRows(rows: Iterable<SplitInputRow>, opts?: SplitOptions): SplitManifest;
|
|
77
|
+
/** Lightweight deterministic 0..(n-1) bucket from a string id. */
|
|
78
|
+
export declare function hashBucket(id: string, n: number): number;
|
|
79
|
+
/**
|
|
80
|
+
* Write a `SplitManifest` to `<outputDir>/{train,val,test}.json`. The manifests are line-separated
|
|
81
|
+
* source_id lists (one id per line) so they diff cleanly in git. Also writes
|
|
82
|
+
* `<outputDir>/MANIFEST.json` with the full structured manifest including holdouts + counts +
|
|
83
|
+
* corpus version.
|
|
84
|
+
*
|
|
85
|
+
* Reruns produce byte-identical files (the underlying `splitRows` is deterministic).
|
|
86
|
+
*/
|
|
87
|
+
export declare function writeSplitManifests(manifest: SplitManifest, outputDir: string): Promise<void>;
|
|
88
|
+
/** Type re-export for callers that want to ingest LabeledRow specifically. */
|
|
89
|
+
export type SplitInputLabeledRow = Pick<LabeledRow, "source_id" | "country" | "corpus_version" | "components">;
|
|
90
|
+
/**
|
|
91
|
+
* Streaming variant of `writeSplitManifests`: derives the per-split source-id .txt manifests +
|
|
92
|
+
* `SPLIT_MANIFEST.json` by streaming three per-split labeled-row JSONL files (one per split).
|
|
93
|
+
* Memory cost is O(1) — `sort(1)` from coreutils handles the deterministic sort with disk spill for
|
|
94
|
+
* files that exceed in-memory thresholds.
|
|
95
|
+
*
|
|
96
|
+
* Used by `buildCorpus` after the align loop has already partitioned labeled rows into
|
|
97
|
+
* `labeled-{train,val,test}.jsonl` via `splitForRow`. Counts are pre-computed by the align loop and
|
|
98
|
+
* passed in (zero re-scan).
|
|
99
|
+
*/
|
|
100
|
+
export declare function writeSplitManifestsFromLabeledFiles(opts: {
|
|
101
|
+
labeledPaths: Record<SplitName, string>;
|
|
102
|
+
outputDir: string;
|
|
103
|
+
corpusVersion: string;
|
|
104
|
+
counts: Record<SplitName, number>;
|
|
105
|
+
holdouts?: Record<string, readonly string[]>;
|
|
106
|
+
}): Promise<SplitManifest["counts"]>;
|
|
107
|
+
export {};
|
|
108
|
+
//# sourceMappingURL=split.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"split.d.ts","sourceRoot":"","sources":["../../src/split.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAQH,OAAO,KAAK,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,YAAY,CAAA;AAE1D,MAAM,MAAM,SAAS,GAAG,OAAO,GAAG,KAAK,GAAG,MAAM,CAAA;AAEhD,MAAM,WAAW,YAAY;IAC5B;;;;OAIG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,CAAC,CAAA;CAC5C;AAED,kDAAkD;AAClD,MAAM,WAAW,aAAa;IAC7B,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,GAAG,EAAE,MAAM,EAAE,CAAA;IACb,IAAI,EAAE,MAAM,EAAE,CAAA;IACd,oEAAoE;IACpE,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,CAAC,CAAA;IAC3C,yEAAyE;IACzE,cAAc,EAAE,MAAM,CAAA;IACtB,sCAAsC;IACtC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAA;CACnE;AAED;;;;;GAKG;AACH,wBAAgB,eAAe,IAAI,MAAM,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,CAAC,CAKnE;AAED,KAAK,aAAa,GAAG,IAAI,CAAC,YAAY,EAAE,WAAW,GAAG,SAAS,GAAG,gBAAgB,GAAG,YAAY,CAAC,CAAA;AAElG;;;;;GAKG;AACH,wBAAgB,WAAW,CAC1B,GAAG,EAAE,IAAI,CAAC,aAAa,EAAE,WAAW,GAAG,SAAS,GAAG,YAAY,CAAC,EAChE,QAAQ,GAAE,MAAM,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,CAAqB,GAC7D,SAAS,CAOX;AAED;;;;;;;GAOG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,IAAI,GAAE,YAAiB,GAAG,aAAa,CAwB/F;AAED,kEAAkE;AAClE,wBAAgB,UAAU,CAAC,EAAE,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,GAAG,MAAM,CAKxD;AAED;;;;;;;GAOG;AACH,wBAAsB,mBAAmB,CAAC,QAAQ,EAAE,aAAa,EAAE,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAYnG;AAED,8EAA8E;AAC9E,MAAM,MAAM,oBAAoB,GAAG,IAAI,CAAC,UAAU,EAAE,WAAW,GAAG,SAAS,GAAG,gBAAgB,GAAG,YAAY,CAAC,CAAA;AAE9G;;;;;;;;;GASG;AACH,wBAAsB,mCAAmC,CAAC,IAAI,EAAE;IAC/D,YAAY,EAAE,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,CAAA;IACvC,SAAS,EAAE,MAAM,CAAA;IACjB,aAAa,EAAE,MAAM,CAAA;IACrB,MAAM,EAAE,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,CAAA;IACjC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,CAAC,CAAA;CAC5C,GAAG,OAAO,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC,CAkBnC"}
|
package/out/src/split.js
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Train / val / test split with **locality holdout** per the Phase 1 plan.
|
|
7
|
+
*
|
|
8
|
+
* The corpus's val + test sets are not randomly sampled rows — they're entire low-density regions
|
|
9
|
+
* held out so the model cannot memorize them at training time. Rationale (per the plan's "Common
|
|
10
|
+
* pitfalls" section): random splits leak by neighborhood — a model fed "13 Main St, Springfield,
|
|
11
|
+
* IL" in train and "15 Main St, Springfield, IL" in test generalizes via region/locality
|
|
12
|
+
* memorization, not by learning the underlying schema.
|
|
13
|
+
*
|
|
14
|
+
* Phase 1 holdouts (chosen for low data density + administrative isolation):
|
|
15
|
+
*
|
|
16
|
+
* - **US**: Vermont, Wyoming, North Dakota
|
|
17
|
+
* - **FR**: Corse, Lozère, Creuse
|
|
18
|
+
*
|
|
19
|
+
* Held-out rows are deterministically split 50/50 between val and test by hashing the row's
|
|
20
|
+
* `source_id`. Non-held-out rows go to train. The 90/5/5 ratio is approximate — what matters is
|
|
21
|
+
* the locality boundary, not the exact split percentages.
|
|
22
|
+
*
|
|
23
|
+
* The output is a `SplitManifest`: three `string[]` arrays of `source_id`. Manifests live in git
|
|
24
|
+
* (under `corpus/splits/<version>/`) so reruns are reproducible bit-for-bit.
|
|
25
|
+
*/
|
|
26
|
+
import { spawn } from "node:child_process";
|
|
27
|
+
import { createHash } from "node:crypto";
|
|
28
|
+
import { createReadStream, createWriteStream } from "node:fs";
|
|
29
|
+
import { mkdir, unlink, writeFile } from "node:fs/promises";
|
|
30
|
+
import { join } from "node:path";
|
|
31
|
+
import { createInterface } from "node:readline";
|
|
32
|
+
/**
 * Phase 1 default holdouts (per plan), keyed by country code.
 *
 * - US: Vermont, Wyoming, North Dakota, each listed by full name and two-letter abbreviation.
 * - FR: Corse, Lozère (with and without the accent), Creuse.
 *
 * A fresh object with fresh arrays is returned on every call.
 */
export function defaultHoldouts() {
    const usRegions = ["Vermont", "VT", "Wyoming", "WY", "North Dakota", "ND"];
    const frRegions = ["Corse", "Lozère", "Lozere", "Creuse"];
    return { US: usRegions, FR: frRegions };
}
|
|
44
|
+
/**
 * Decide which split a single row belongs to, without keeping the row around.
 *
 * A row is held out when `row.components.region` is defined and appears verbatim in the holdout
 * list for `row.country`; every other row is `"train"`. Held-out rows are divided between `"val"`
 * and `"test"` by `hashBucket(row.source_id, 2)`, so the same `source_id` always lands in the
 * same split.
 *
 * @param row Row providing `source_id`, `country` and `components.region`.
 * @param holdouts Country → region names to hold out; defaults to `defaultHoldouts()`.
 */
export function splitForRow(row, holdouts = defaultHoldouts()) {
    const region = row.components.region;
    const heldOutRegions = holdouts[row.country] ?? [];
    if (region === undefined || !heldOutRegions.includes(region)) {
        return "train";
    }
    // Bucket 0 → val, bucket 1 → test.
    if (hashBucket(row.source_id, 2) === 0) {
        return "val";
    }
    return "test";
}
|
|
59
|
+
/**
 * Build a `SplitManifest` in memory from an iterable of rows (labeled or canonical; only
 * `source_id`, `country`, `corpus_version` and `components.region` are read).
 *
 * Each row's `source_id` is appended to the list chosen by `splitForRow`. The manifest's
 * `corpus_version` is the first truthy `corpus_version` encountered, or `""` if none is.
 *
 * Intended for tests and small fixture runs, since it holds every `source_id` in memory.
 *
 * @param rows Rows to partition.
 * @param opts Optional `holdouts` override; defaults to `defaultHoldouts()`.
 */
export function splitRows(rows, opts = {}) {
    const holdouts = opts.holdouts ?? defaultHoldouts();
    const trainIds = [];
    const valIds = [];
    const testIds = [];
    let corpusVersion = "";
    for (const row of rows) {
        if (corpusVersion === "" && row.corpus_version) {
            corpusVersion = row.corpus_version;
        }
        switch (splitForRow(row, holdouts)) {
            case "train":
                trainIds.push(row.source_id);
                break;
            case "val":
                valIds.push(row.source_id);
                break;
            default:
                testIds.push(row.source_id);
        }
    }
    const counts = {
        train: trainIds.length,
        val: valIds.length,
        test: testIds.length,
        total: trainIds.length + valIds.length + testIds.length,
    };
    return {
        train: trainIds,
        val: valIds,
        test: testIds,
        holdouts,
        corpus_version: corpusVersion,
        counts,
    };
}
|
|
94
|
+
/**
 * Map a string id to a deterministic bucket index in `0..(n-1)`.
 *
 * The id is hashed with SHA-256 and the first four digest bytes are read as an unsigned
 * big-endian 32-bit integer (a plain number, so no bigint is involved), then reduced modulo `n`.
 *
 * @param id Identifier to hash.
 * @param n Number of buckets.
 * @returns The bucket index for `id`.
 */
export function hashBucket(id, n) {
    const leadingWord = createHash("sha256").update(id).digest().readUInt32BE(0);
    return leadingWord % n;
}
|
|
101
|
+
/**
 * Persist a `SplitManifest` under `outputDir` (created recursively if missing).
 *
 * Files written:
 *
 * - `<outputDir>/{train,val,test}.txt` — one source_id per line, sorted with the default
 *   `Array.prototype.sort` order, each line newline-terminated. An empty split yields an empty
 *   file. Line-per-id keeps the files easy to diff.
 * - `<outputDir>/SPLIT_MANIFEST.json` — pretty-printed `{ corpus_version, holdouts, counts }`
 *   followed by a trailing newline.
 *
 * Ids are sorted on a copy, so the manifest's own arrays are left untouched and the output does
 * not depend on the order in which ids were collected.
 *
 * @param manifest Manifest with `train`, `val`, `test`, `corpus_version`, `holdouts`, `counts`.
 * @param outputDir Destination directory.
 */
export async function writeSplitManifests(manifest, outputDir) {
    await mkdir(outputDir, { recursive: true });
    for (const name of ["train", "val", "test"]) {
        const ids = [...manifest[name]];
        ids.sort();
        const body = ids.length ? `${ids.join("\n")}\n` : "";
        await writeFile(join(outputDir, `${name}.txt`), body, "utf8");
    }
    const { corpus_version, holdouts, counts } = manifest;
    const summaryJson = JSON.stringify({ corpus_version, holdouts, counts }, null, 2);
    await writeFile(join(outputDir, "SPLIT_MANIFEST.json"), `${summaryJson}\n`, "utf8");
}
|
|
122
|
+
/**
 * Streaming counterpart of `writeSplitManifests`. Instead of taking id arrays, it derives each
 * `<outputDir>/{train,val,test}.txt` from a per-split labeled-row JSONL file by delegating to
 * `streamSortedSourceIds`, then writes `<outputDir>/SPLIT_MANIFEST.json`.
 *
 * Per the original note, `buildCorpus` calls this after its align loop has partitioned labeled
 * rows into `labeled-{train,val,test}.jsonl` via `splitForRow`. The per-split counts are taken
 * from `opts.counts` as given; the JSONL files are not re-scanned to count rows.
 *
 * @param opts Options object:
 *   - `outputDir`: destination directory (created recursively).
 *   - `labeledPaths`: `{ train, val, test }` paths of the labeled JSONL inputs.
 *   - `counts`: `{ train, val, test }` row counts.
 *   - `corpusVersion`: written out as `corpus_version`.
 *   - `holdouts` (optional): defaults to `defaultHoldouts()`.
 * @returns The counts object written to the summary: `opts.counts` plus the computed `total`.
 */
export async function writeSplitManifestsFromLabeledFiles(opts) {
    await mkdir(opts.outputDir, { recursive: true });
    const holdouts = opts.holdouts ?? defaultHoldouts();
    // Splits are processed one after another, each into its own `<split>.txt`.
    for (const split of ["train", "val", "test"]) {
        await streamSortedSourceIds(opts.labeledPaths[split], join(opts.outputDir, `${split}.txt`));
    }
    const counts = {
        ...opts.counts,
        total: opts.counts.train + opts.counts.val + opts.counts.test,
    };
    const summary = { corpus_version: opts.corpusVersion, holdouts, counts };
    const summaryPath = join(opts.outputDir, "SPLIT_MANIFEST.json");
    await writeFile(summaryPath, `${JSON.stringify(summary, null, 2)}\n`, "utf8");
    return summary.counts;
}
|
|
149
|
+
/**
 * Extract every string `source_id` from a labeled JSONL file and write them to `outPath`, one
 * per line, byte-sorted by `sort(1)` under `LC_ALL=C`.
 *
 * The ids are first streamed to `<outPath>.unsorted`, then sorted into `outPath`. Empty input
 * produces an empty output file (not an absent one). Rows are never collected in memory: the
 * writer's backpressure is honoured while streaming, and sorting is delegated to `sort(1)`.
 *
 * The temporary `.unsorted` file is removed on success and on failure.
 *
 * @param labeledJsonlPath Path to the labeled-row JSONL input.
 * @param outPath Path of the sorted id list to produce.
 * @throws Rejects when the input cannot be read, a non-empty line is not valid JSON (or parses
 *   to `null`), the temp file cannot be written, or `sort` fails or cannot be spawned.
 */
async function streamSortedSourceIds(labeledJsonlPath, outPath) {
    const unsortedPath = `${outPath}.unsorted`;
    try {
        await writeUnsortedSourceIds(labeledJsonlPath, unsortedPath);
        await sortFileBytewise(unsortedPath, outPath);
    }
    finally {
        // Best-effort cleanup; runs on failure too so no stale temp file is left behind.
        await unlink(unsortedPath).catch(() => { });
    }
}
/**
 * Stream `labeledJsonlPath` line by line and append each row's string `source_id` to
 * `unsortedPath`. Blank lines and rows without a string `source_id` are skipped. The returned
 * promise settles only once the output stream has closed, so the caller can safely delete or
 * sort the file afterwards.
 */
function writeUnsortedSourceIds(labeledJsonlPath, unsortedPath) {
    return new Promise((resolve, reject) => {
        const input = createReadStream(labeledJsonlPath, { encoding: "utf8" });
        const out = createWriteStream(unsortedPath, { encoding: "utf8" });
        const rl = createInterface({ input, crlfDelay: Infinity });
        let failed = false;
        let failure;
        let readerClosed = false;
        let awaitingDrain = false;
        // First failure wins: stop reading, tear both streams down, and let the writer's
        // "close" event deliver the rejection once its file handle is really released.
        const fail = (err) => {
            if (failed)
                return;
            failed = true;
            failure = err;
            if (!readerClosed)
                rl.close();
            input.destroy();
            out.destroy();
        };
        rl.on("line", (line) => {
            if (failed || !line)
                return;
            let sourceId;
            try {
                // Property access stays inside the try: a line holding JSON `null` is rejected
                // just like a malformed line.
                sourceId = JSON.parse(line).source_id;
            }
            catch (err) {
                fail(err);
                return;
            }
            if (typeof sourceId !== "string")
                return;
            const hasRoom = out.write(`${sourceId}\n`);
            // Honour backpressure: stop pulling input until the writer drains. Lines already
            // split from the current chunk may still arrive, hence the single-listener guard.
            if (!hasRoom && !awaitingDrain && !readerClosed) {
                awaitingDrain = true;
                rl.pause();
                out.once("drain", () => {
                    awaitingDrain = false;
                    if (!readerClosed)
                        rl.resume();
                });
            }
        });
        rl.on("close", () => {
            readerClosed = true;
            if (!failed)
                out.end();
        });
        rl.on("error", fail);
        // Listen on the raw input stream as well, so read errors (e.g. a missing file) become
        // a rejection without depending on readline to forward them.
        input.on("error", fail);
        out.on("error", fail);
        out.on("close", () => {
            if (failed)
                reject(failure);
            else
                resolve();
        });
    });
}
/**
 * Sort `unsortedPath` into `outPath` with `sort(1)`.
 * LC_ALL=C: byte-sort, locale-independent → deterministic across hosts.
 */
function sortFileBytewise(unsortedPath, outPath) {
    return new Promise((resolve, reject) => {
        // `-o` goes before the input operand (the portable form) and `--` keeps a path that
        // starts with "-" from being parsed as an option.
        const proc = spawn("sort", ["-o", outPath, "--", unsortedPath], {
            env: { ...process.env, LC_ALL: "C" },
            stdio: ["ignore", "ignore", "pipe"],
        });
        // Keep a bounded slice of stderr so a failure explains itself.
        let stderrText = "";
        proc.stderr.setEncoding("utf8");
        proc.stderr.on("data", (chunk) => {
            if (stderrText.length < 4096)
                stderrText += chunk;
        });
        proc.on("error", reject);
        // "close" fires after stdio has been flushed, so stderrText is complete here.
        proc.on("close", (code, signal) => {
            if (code === 0) {
                resolve();
                return;
            }
            const signalNote = signal ? ` (signal ${signal})` : "";
            const detail = stderrText.trim();
            reject(new Error(`sort exited with code ${code}${signalNote}${detail ? `: ${detail}` : ""}`));
        });
    });
}
|
|
191
|
+
//# sourceMappingURL=split.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"split.js","sourceRoot":"","sources":["../../src/split.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAA;AAC1C,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAA;AACxC,OAAO,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,SAAS,CAAA;AAC7D,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAA;AAC3D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAA;AAChC,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAA;AA2B/C;;;;;GAKG;AACH,MAAM,UAAU,eAAe;IAC9B,OAAO;QACN,EAAE,EAAE,CAAC,SAAS,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,cAAc,EAAE,IAAI,CAAC;QAC5D,EAAE,EAAE,CAAC,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,CAAC;KAC3C,CAAA;AACF,CAAC;AAID;;;;;GAKG;AACH,MAAM,UAAU,WAAW,CAC1B,GAAgE,EAChE,WAA8C,eAAe,EAAE;IAE/D,MAAM,MAAM,GAAG,GAAG,CAAC,UAAU,CAAC,MAAM,CAAA;IACpC,MAAM,eAAe,GAAG,QAAQ,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,CAAA;IACnD,MAAM,SAAS,GAAG,MAAM,KAAK,SAAS,IAAI,eAAe,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAA;IAC1E,IAAI,CAAC,SAAS;QAAE,OAAO,OAAO,CAAA;IAC9B,oFAAoF;IACpF,OAAO,UAAU,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,CAAA;AAC3D,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,SAAS,CAAC,IAA6B,EAAE,OAAqB,EAAE;IAC/E,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,IAAI,eAAe,EAAE,CAAA;IACnD,MAAM,KAAK,GAAa,EAAE,CAAA;IAC1B,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,MAAM,IAAI,GAAa,EAAE,CAAA;IACzB,IAAI,cAAc,GAAG,EAAE,CAAA;IAEvB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACxB,IAAI,CAAC,cAAc,IAAI,GAAG,CAAC,cAAc;YAAE,cAAc,GAAG,GAAG,CAAC,cAAc,CAAA;QAC9E,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAA;QACxC,IAAI,KAAK,KAAK,OAAO;YAAE,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,CAAA;aAC3C,IAAI,KAAK,KAAK,KAAK;YAAE,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,CAAA;;YAC5C,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,CAAA;IAC9B,CAAC;IAED,MAAM,KAAK,GAAG,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAA;IACrD,OAAO;QACN,KAAK;QACL,GAAG;QACH,IAAI;QACJ,QAAQ;QACR,cAAc;QACd,MAAM,EAAE,EAAE,KAAK,EAAE,KAAK,CAAC,MAAM,EAAE,GAAG,EAAE,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,MAAM,EAAE,KAAK,EAAE;KAC1E,CAAA;AACF,CAAC;AAED,kEAAkE;AAClE,MAAM,UAA
U,UAAU,CAAC,EAAU,EAAE,CAAS;IAC/C,MAAM,MAAM,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,MAAM,EAAE,CAAA;IACvD,mDAAmD;IACnD,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAE,GAAG,aAAa,GAAG,MAAM,CAAC,CAAC,CAAE,GAAG,UAAU,GAAG,MAAM,CAAC,CAAC,CAAE,GAAG,OAAO,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;IAClG,OAAO,CAAC,GAAG,CAAC,CAAA;AACb,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,QAAuB,EAAE,SAAiB;IACnF,MAAM,KAAK,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;IAC3C,KAAK,MAAM,IAAI,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,MAAM,CAAU,EAAE,CAAC;QACtD,MAAM,MAAM,GAAG,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;QACzC,MAAM,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,GAAG,IAAI,MAAM,CAAC,EAAE,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,MAAM,CAAC,CAAA;IACzG,CAAC;IACD,MAAM,OAAO,GAAG;QACf,cAAc,EAAE,QAAQ,CAAC,cAAc;QACvC,QAAQ,EAAE,QAAQ,CAAC,QAAQ;QAC3B,MAAM,EAAE,QAAQ,CAAC,MAAM;KACvB,CAAA;IACD,MAAM,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,qBAAqB,CAAC,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;AACzG,CAAC;AAKD;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,mCAAmC,CAAC,IAMzD;IACA,MAAM,KAAK,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;IAChD,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,IAAI,eAAe,EAAE,CAAA;IAEnD,KAAK,MAAM,KAAK,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,MAAM,CAAU,EAAE,CAAC;QACvD,MAAM,WAAW,GAAG,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAA;QAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,GAAG,KAAK,MAAM,CAAC,CAAA;QACpD,MAAM,qBAAqB,CAAC,WAAW,EAAE,OAAO,CAAC,CAAA;IAClD,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAA;IACpE,MAAM,OAAO,GAAG;QACf,cAAc,EAAE,IAAI,CAAC,aAAa;QAClC,QAAQ;QACR,MAAM,EAAE,EAAE,GAAG,IAAI,CAAC,MAAM,EAAE,KAAK,EAAE;KACjC,CAAA;IACD,MAAM,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,qBAAqB,CAAC,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;IAC7G,OAAO,OAAO,CAAC,MAAM,CAAA;AACtB,CAAC;AAED;;;;GAIG;AACH,KAAK,UAAU,qBAAqB,CAAC,gBAAwB,EAA
E,OAAe;IAC7E,MAAM,YAAY,GAAG,GAAG,OAAO,WAAW,CAAA;IAC1C,MAAM,GAAG,GAAG,iBAAiB,CAAC,YAAY,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;IACjE,MAAM,EAAE,GAAG,eAAe,CAAC,EAAE,KAAK,EAAE,gBAAgB,CAAC,gBAAgB,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAA;IAEpH,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QAC3C,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;YACtB,IAAI,CAAC,IAAI;gBAAE,OAAM;YACjB,IAAI,CAAC;gBACJ,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAA2B,CAAA;gBACtD,IAAI,OAAO,GAAG,CAAC,SAAS,KAAK,QAAQ;oBAAE,GAAG,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,SAAS,IAAI,CAAC,CAAA;YACvE,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACd,MAAM,CAAC,GAAY,CAAC,CAAA;YACrB,CAAC;QACF,CAAC,CAAC,CAAA;QACF,EAAE,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;YACnB,GAAG,CAAC,GAAG,EAAE,CAAA;QACV,CAAC,CAAC,CAAA;QACF,EAAE,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;QACtB,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,OAAO,EAAE,CAAC,CAAA;QAChC,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;IACxB,CAAC,CAAC,CAAA;IAEF,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QAC3C,wEAAwE;QACxE,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,YAAY,EAAE,IAAI,EAAE,OAAO,CAAC,EAAE,EAAE,GAAG,EAAE,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,EAAE,CAAC,CAAA;QACnG,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;QACxB,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;YACxB,IAAI,IAAI,KAAK,CAAC;gBAAE,OAAO,EAAE,CAAA;;gBACpB,MAAM,CAAC,IAAI,KAAK,CAAC,yBAAyB,IAAI,EAAE,CAAC,CAAC,CAAA;QACxD,CAAC,CAAC,CAAA;IACH,CAAC,CAAC,CAAA;IACF,MAAM,MAAM,CAAC,YAAY,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAA;AAC3C,CAAC"}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Synthesis / augmentation per Phase 1 task #6.
|
|
7
|
+
*
|
|
8
|
+
* An `Augmentation` is a pure function that takes a `CanonicalRow` and either returns a new
|
|
9
|
+
* `CanonicalRow` (with `raw` AND `components` transformed in lockstep so alignment still
|
|
10
|
+
* succeeds) or `null` when the augmentation doesn't apply to the row's shape.
|
|
11
|
+
*
|
|
12
|
+
* Synthesis runs **before** alignment: augmentations transform raw + components together, and the
|
|
13
|
+
* runner reruns alignment on each augmented row to produce its labels. This keeps the synthesis
|
|
14
|
+
* surface small (no token/label arithmetic) at the cost of a re-run.
|
|
15
|
+
*
|
|
16
|
+
* Every augmented row carries the `synth` marker:
|
|
17
|
+
*
|
|
18
|
+
* - `method`: the augmentation's stable id (e.g. `"case-upper"`, `"accent-strip"`).
|
|
19
|
+
* - `base_source_id`: the source_id of the un-augmented (or upstream-augmented) row, so ancestry is
|
|
20
|
+
* traceable.
|
|
21
|
+
*
|
|
22
|
+
* Phase 1 implements the locale-agnostic + most useful US/FR augmentations. Typo injection and
|
|
23
|
+
* other stochastic augmentations are intentionally deferred — they need a seed-aware API and are
|
|
24
|
+
* most useful at training time, not corpus build time.
|
|
25
|
+
*/
|
|
26
|
+
import { type Tokenizer } from "./tokenize.js";
|
|
27
|
+
import type { CanonicalRow, LabeledRow, QuarantinedRow } from "./types.js";
|
|
28
|
+
/**
|
|
29
|
+
* An augmentation transforms a single row. Return `null` if the augmentation doesn't apply (e.g.
|
|
30
|
+
* accent-strip on a row that has no accents; particle-strip on a US row).
|
|
31
|
+
*/
|
|
32
|
+
export type Augmentation = (row: CanonicalRow) => CanonicalRow | null;
|
|
33
|
+
/** Upper-case raw + every component value. Returns null if already all-upper. */
|
|
34
|
+
export declare const caseUpper: Augmentation;
|
|
35
|
+
/** Lower-case raw + every component value. Returns null if already all-lower. */
|
|
36
|
+
export declare const caseLower: Augmentation;
|
|
37
|
+
/** Drop commas from `raw`. Components unchanged (they didn't carry commas). */
|
|
38
|
+
export declare const dropCommas: Augmentation;
|
|
39
|
+
/**
|
|
40
|
+
* Replace single spaces with double spaces in `raw` AND in every component value. The component
|
|
41
|
+
* update is load-bearing for alignment: `alignRow` substring-searches each component's surface form
|
|
42
|
+
* inside `raw`, so doubling the spaces in `raw` only would leave single-spaced components
|
|
43
|
+
* unfindable (this was the bug behind v0.1.1's first build attempt — 99.9% of quarantined rows
|
|
44
|
+
* traced back to this augmentation). Doubling both keeps the substring contract intact.
|
|
45
|
+
*/
|
|
46
|
+
export declare const doubleSpace: Augmentation;
|
|
47
|
+
/**
|
|
48
|
+
* Strip Unicode combining marks (accents, diacritics) from raw + components. "Hôtel" → "Hotel";
|
|
49
|
+
* "Île-de-France" → "Ile-de-France". Returns null if the row has no accents.
|
|
50
|
+
*/
|
|
51
|
+
export declare const accentStrip: Augmentation;
|
|
52
|
+
/** US: substitute the full state name for its alpha-2 abbreviation. */
|
|
53
|
+
export declare const stateExpand: Augmentation;
|
|
54
|
+
/** US: substitute the alpha-2 abbreviation for the full state name. */
|
|
55
|
+
export declare const stateAbbreviate: Augmentation;
|
|
56
|
+
/** US: expand directional abbreviations in `street`/`street_suffix` (NW → Northwest). */
|
|
57
|
+
export declare const directionalExpand: Augmentation;
|
|
58
|
+
/** US: abbreviate directional words (Northwest → NW). */
|
|
59
|
+
export declare const directionalAbbreviate: Augmentation;
|
|
60
|
+
/**
|
|
61
|
+
* US: swap the trailing street-suffix word in `components.street` to its preferred USPS
|
|
62
|
+
* abbreviation, preserving case. `"5th Avenue"` → `"5th Ave"`; `"5TH AVENUE"` → `"5TH AVE"`; `"main
|
|
63
|
+
* street"` → `"main st"`. Returns null when no trailing suffix is recognized, when the trailing
|
|
64
|
+
* word is already the preferred abbreviation, or when the swap would leave `raw` untouched
|
|
65
|
+
* (alignment requires both raw and components to move in lockstep).
|
|
66
|
+
*
|
|
67
|
+
* Targets the trailing word only to avoid mangling streets like "Avenue of the Americas" where the
|
|
68
|
+
* suffix-shaped word is part of the proper name rather than a USPS suffix.
|
|
69
|
+
*/
|
|
70
|
+
export declare const streetSuffixAbbreviate: Augmentation;
|
|
71
|
+
/**
|
|
72
|
+
* US: swap the trailing street-suffix word in `components.street` to its full canonical form,
|
|
73
|
+
* preserving case. `"5th Ave"` → `"5th Avenue"`; `"5TH AVE"` → `"5TH AVENUE"`; `"main st"` → `"main
|
|
74
|
+
* street"`. Returns null when no trailing suffix is recognized, when the trailing word is already
|
|
75
|
+
* the canonical full form, or when the swap would leave `raw` untouched.
|
|
76
|
+
*
|
|
77
|
+
* Same trailing-word-only rule as `streetSuffixAbbreviate`.
|
|
78
|
+
*/
|
|
79
|
+
export declare const streetSuffixExpand: Augmentation;
|
|
80
|
+
/** US: ZIP+4 form `12345-6789` → `123456789` (dash dropped). */
|
|
81
|
+
export declare const zipPlus4DashDrop: Augmentation;
|
|
82
|
+
/** FR: drop the article particle from a street ("Rue de la République" → "Rue République"). */
|
|
83
|
+
export declare const particleStrip: Augmentation;
|
|
84
|
+
/** Stable id → augmentation table. */
|
|
85
|
+
export declare const AUGMENTATIONS: Record<string, Augmentation>;
|
|
86
|
+
/** Default augmentation set, by country. Phase 1: US + FR; others get the locale-agnostic set. */
|
|
87
|
+
export declare function defaultAugmentationsForCountry(country: string): readonly Augmentation[];
|
|
88
|
+
/**
|
|
89
|
+
* Run every augmentation against a row; collect the non-null outputs. The augmentations are pure,
|
|
90
|
+
* so callers can compose them off this generator (e.g. nesting accent-strip ∘ state-abbreviate).
|
|
91
|
+
*/
|
|
92
|
+
export declare function synthesizeRow(row: CanonicalRow, augmentations?: readonly Augmentation[]): Generator<CanonicalRow>;
|
|
93
|
+
/** Options accepted by `composeAdversarialRow`. */
|
|
94
|
+
export interface ComposeAdversarialOptions {
|
|
95
|
+
/**
|
|
96
|
+
* Stable pattern label written into the emitted row's `synth.method` field (as
|
|
97
|
+
* `compose:<pattern>`). Free-form but should be one of a small set of canonical pattern names so
|
|
98
|
+
* downstream filtering / stratification can target individual patterns.
|
|
99
|
+
*
|
|
100
|
+
* Recommended values (Phase 1.6 §2.1):
|
|
101
|
+
*
|
|
102
|
+
* - `"place-name-venue"` — venue token shared with locality (`Buffalo Health Clinic, Buffalo NY`).
|
|
103
|
+
* - `"place-shaped-venue"` — venue contains a place-shaped substring (`New York, New York
|
|
104
|
+
* Steakhouse, Las Vegas NV`).
|
|
105
|
+
* - `"particle-honorific"` — apostrophe + St./Saint ambiguity (`P'tit St. Denis Street Café`).
|
|
106
|
+
*/
|
|
107
|
+
pattern: string;
|
|
108
|
+
/**
|
|
109
|
+
* Separator inserted between the venue and the address `raw`. Default `", "`. Single space (`"
|
|
110
|
+
* "`) produces the harder unpunctuated variant; newline (`"\n"`) the multi-line variant.
|
|
111
|
+
*/
|
|
112
|
+
separator?: string;
|
|
113
|
+
/**
|
|
114
|
+
* Tokenizer to apply to the venue prefix. Default `whitespaceTokenizer()`. The address half uses
|
|
115
|
+
* the same tokenizer when re-aligned — pass a consistent one if customizing.
|
|
116
|
+
*/
|
|
117
|
+
tokenizer?: Tokenizer;
|
|
118
|
+
}
|
|
119
|
+
/** Either a successful labeled composition or a quarantined attempt. */
|
|
120
|
+
export type ComposeResult = {
|
|
121
|
+
kind: "labeled";
|
|
122
|
+
row: LabeledRow;
|
|
123
|
+
} | {
|
|
124
|
+
kind: "quarantined";
|
|
125
|
+
row: QuarantinedRow;
|
|
126
|
+
};
|
|
127
|
+
/**
|
|
128
|
+
* Compose a venue string + an address row into a single adversarial `LabeledRow`.
|
|
129
|
+
*
|
|
130
|
+
* The emitted row's `raw` is `${venue}${separator}${address.raw}`. Tokens are produced by
|
|
131
|
+
* tokenizing the two halves independently and concatenating; labels are venue tokens → `B-venue` /
|
|
132
|
+
* `I-venue` followed by the address's labels (obtained by aligning the input address in isolation).
|
|
133
|
+
* This deterministic boundary is the entire point of the primitive: the embedded place-shaped
|
|
134
|
+
* tokens in the venue stay labeled as `venue`, never as the address's locality / region / etc.,
|
|
135
|
+
* even when they share surface forms.
|
|
136
|
+
*
|
|
137
|
+
* The address's components are forwarded as-is (alignment ran on them and they survived); `venue`
|
|
138
|
+
* is added on top with the trimmed venue string as its surface form.
|
|
139
|
+
*
|
|
140
|
+
* Returns `{ kind: "quarantined" }` when:
|
|
141
|
+
*
|
|
142
|
+
* - The venue is empty or whitespace-only.
|
|
143
|
+
* - The address row fails alignment in isolation (the underlying failure reason is propagated).
|
|
144
|
+
*/
|
|
145
|
+
export declare function composeAdversarialRow(venue: string, address: CanonicalRow, options: ComposeAdversarialOptions): ComposeResult;
|
|
146
|
+
//# sourceMappingURL=synthesize.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"synthesize.d.ts","sourceRoot":"","sources":["../../src/synthesize.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAKH,OAAO,EAAuB,KAAK,SAAS,EAAE,MAAM,eAAe,CAAA;AACnE,OAAO,KAAK,EAAE,YAAY,EAAE,UAAU,EAAE,cAAc,EAAE,MAAM,YAAY,CAAA;AAE1E;;;GAGG;AACH,MAAM,MAAM,YAAY,GAAG,CAAC,GAAG,EAAE,YAAY,KAAK,YAAY,GAAG,IAAI,CAAA;AAyBrE,iFAAiF;AACjF,eAAO,MAAM,SAAS,EAAE,YAQvB,CAAA;AAED,iFAAiF;AACjF,eAAO,MAAM,SAAS,EAAE,YAQvB,CAAA;AAED,+EAA+E;AAC/E,eAAO,MAAM,UAAU,EAAE,YAIxB,CAAA;AAED;;;;;;GAMG;AACH,eAAO,MAAM,WAAW,EAAE,YAQzB,CAAA;AAED;;;GAGG;AACH,eAAO,MAAM,WAAW,EAAE,YAQzB,CAAA;AAqED,uEAAuE;AACvE,eAAO,MAAM,WAAW,EAAE,YAazB,CAAA;AAED,uEAAuE;AACvE,eAAO,MAAM,eAAe,EAAE,YAW7B,CAAA;AAgBD,yFAAyF;AACzF,eAAO,MAAM,iBAAiB,EAAE,YAkB/B,CAAA;AAED,yDAAyD;AACzD,eAAO,MAAM,qBAAqB,EAAE,YAqBnC,CAAA;AAED;;;;;;;;;GASG;AACH,eAAO,MAAM,sBAAsB,EAAE,YAkBpC,CAAA;AAED;;;;;;;GAOG;AACH,eAAO,MAAM,kBAAkB,EAAE,YAiBhC,CAAA;AAED,gEAAgE;AAChE,eAAO,MAAM,gBAAgB,EAAE,YAQ9B,CAAA;AAMD,+FAA+F;AAC/F,eAAO,MAAM,aAAa,EAAE,YAW3B,CAAA;AAMD,sCAAsC;AACtC,eAAO,MAAM,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CActD,CAAA;AAED,kGAAkG;AAClG,wBAAgB,8BAA8B,CAAC,OAAO,EAAE,MAAM,GAAG,SAAS,YAAY,EAAE,CAmBvF;AAED;;;GAGG;AACH,wBAAiB,aAAa,CAC7B,GAAG,EAAE,YAAY,EACjB,aAAa,GAAE,SAAS,YAAY,EAAgD,GAClF,SAAS,CAAC,YAAY,CAAC,CAKzB;AAqCD,mDAAmD;AACnD,MAAM,WAAW,yBAAyB;IACzC;;;;;;;;;;;OAWG;IACH,OAAO,EAAE,MAAM,CAAA;IAEf;;;OAGG;IACH,SAAS,CAAC,EAAE,MAAM,CAAA;IAElB;;;OAGG;IACH,SAAS,CAAC,EAAE,SAAS,CAAA;CACrB;AAED,wEAAwE;AACxE,MAAM,MAAM,aAAa,GAAG;IAAE,IAAI,EAAE,SAAS,CAAC;IAAC,GAAG,EAAE,UAAU,CAAA;CAAE,GAAG;IAAE,IAAI,EAAE,aAAa,CAAC;IAAC,GAAG,EAAE,cAAc,CAAA;CAAE,CAAA;AAE/G;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,qBAAqB,CACpC,KAAK,EAAE,MAAM,EACb,OAAO,EAAE,YAAY,EACrB,OAAO,EAAE,yBAAyB,GAChC,aAAa,CAsDf"}
|