@mailwoman/corpus 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. package/out/src/adapter.d.ts +96 -0
  2. package/out/src/adapter.d.ts.map +1 -0
  3. package/out/src/adapter.js +107 -0
  4. package/out/src/adapter.js.map +1 -0
  5. package/out/src/adapters/ban/adapter.d.ts +32 -0
  6. package/out/src/adapters/ban/adapter.d.ts.map +1 -0
  7. package/out/src/adapters/ban/adapter.js +133 -0
  8. package/out/src/adapters/ban/adapter.js.map +1 -0
  9. package/out/src/adapters/fcc-bdc/adapter.d.ts +61 -0
  10. package/out/src/adapters/fcc-bdc/adapter.d.ts.map +1 -0
  11. package/out/src/adapters/fcc-bdc/adapter.js +153 -0
  12. package/out/src/adapters/fcc-bdc/adapter.js.map +1 -0
  13. package/out/src/adapters/index.d.ts +42 -0
  14. package/out/src/adapters/index.d.ts.map +1 -0
  15. package/out/src/adapters/index.js +76 -0
  16. package/out/src/adapters/index.js.map +1 -0
  17. package/out/src/adapters/openaddresses/adapter.d.ts +60 -0
  18. package/out/src/adapters/openaddresses/adapter.d.ts.map +1 -0
  19. package/out/src/adapters/openaddresses/adapter.js +174 -0
  20. package/out/src/adapters/openaddresses/adapter.js.map +1 -0
  21. package/out/src/adapters/state-ia-contractors/adapter.d.ts +23 -0
  22. package/out/src/adapters/state-ia-contractors/adapter.d.ts.map +1 -0
  23. package/out/src/adapters/state-ia-contractors/adapter.js +113 -0
  24. package/out/src/adapters/state-ia-contractors/adapter.js.map +1 -0
  25. package/out/src/adapters/state-ny-notaries/adapter.d.ts +21 -0
  26. package/out/src/adapters/state-ny-notaries/adapter.d.ts.map +1 -0
  27. package/out/src/adapters/state-ny-notaries/adapter.js +132 -0
  28. package/out/src/adapters/state-ny-notaries/adapter.js.map +1 -0
  29. package/out/src/adapters/state-tx-notaries/adapter.d.ts +22 -0
  30. package/out/src/adapters/state-tx-notaries/adapter.d.ts.map +1 -0
  31. package/out/src/adapters/state-tx-notaries/adapter.js +125 -0
  32. package/out/src/adapters/state-tx-notaries/adapter.js.map +1 -0
  33. package/out/src/adapters/tiger/adapter.d.ts +45 -0
  34. package/out/src/adapters/tiger/adapter.d.ts.map +1 -0
  35. package/out/src/adapters/tiger/adapter.js +179 -0
  36. package/out/src/adapters/tiger/adapter.js.map +1 -0
  37. package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts +36 -0
  38. package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts.map +1 -0
  39. package/out/src/adapters/usgov-hrsa-fqhc/adapter.js +147 -0
  40. package/out/src/adapters/usgov-hrsa-fqhc/adapter.js.map +1 -0
  41. package/out/src/adapters/usgov-imls-pls/adapter.d.ts +25 -0
  42. package/out/src/adapters/usgov-imls-pls/adapter.d.ts.map +1 -0
  43. package/out/src/adapters/usgov-imls-pls/adapter.js +118 -0
  44. package/out/src/adapters/usgov-imls-pls/adapter.js.map +1 -0
  45. package/out/src/adapters/usgov-nad/adapter.d.ts +37 -0
  46. package/out/src/adapters/usgov-nad/adapter.d.ts.map +1 -0
  47. package/out/src/adapters/usgov-nad/adapter.js +227 -0
  48. package/out/src/adapters/usgov-nad/adapter.js.map +1 -0
  49. package/out/src/adapters/usgov-nppes/adapter.d.ts +28 -0
  50. package/out/src/adapters/usgov-nppes/adapter.d.ts.map +1 -0
  51. package/out/src/adapters/usgov-nppes/adapter.js +123 -0
  52. package/out/src/adapters/usgov-nppes/adapter.js.map +1 -0
  53. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts +35 -0
  54. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts.map +1 -0
  55. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js +162 -0
  56. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js.map +1 -0
  57. package/out/src/adapters/wof-admin-json/adapter.d.ts +85 -0
  58. package/out/src/adapters/wof-admin-json/adapter.d.ts.map +1 -0
  59. package/out/src/adapters/wof-admin-json/adapter.js +241 -0
  60. package/out/src/adapters/wof-admin-json/adapter.js.map +1 -0
  61. package/out/src/adapters/wof-postalcode-json/adapter.d.ts +63 -0
  62. package/out/src/adapters/wof-postalcode-json/adapter.d.ts.map +1 -0
  63. package/out/src/adapters/wof-postalcode-json/adapter.js +178 -0
  64. package/out/src/adapters/wof-postalcode-json/adapter.js.map +1 -0
  65. package/out/src/align.d.ts +58 -0
  66. package/out/src/align.d.ts.map +1 -0
  67. package/out/src/align.js +139 -0
  68. package/out/src/align.js.map +1 -0
  69. package/out/src/build.d.ts +104 -0
  70. package/out/src/build.d.ts.map +1 -0
  71. package/out/src/build.js +201 -0
  72. package/out/src/build.js.map +1 -0
  73. package/out/src/codex/us-fips-state.d.ts +44 -0
  74. package/out/src/codex/us-fips-state.d.ts.map +1 -0
  75. package/out/src/codex/us-fips-state.js +105 -0
  76. package/out/src/codex/us-fips-state.js.map +1 -0
  77. package/out/src/codex/us-street-suffix.d.ts +259 -0
  78. package/out/src/codex/us-street-suffix.d.ts.map +1 -0
  79. package/out/src/codex/us-street-suffix.js +285 -0
  80. package/out/src/codex/us-street-suffix.js.map +1 -0
  81. package/out/src/format.d.ts +79 -0
  82. package/out/src/format.d.ts.map +1 -0
  83. package/out/src/format.js +151 -0
  84. package/out/src/format.js.map +1 -0
  85. package/out/src/golden.d.ts +50 -0
  86. package/out/src/golden.d.ts.map +1 -0
  87. package/out/src/golden.js +104 -0
  88. package/out/src/golden.js.map +1 -0
  89. package/out/src/index.d.ts +18 -0
  90. package/out/src/index.d.ts.map +1 -0
  91. package/out/src/index.js +18 -0
  92. package/out/src/index.js.map +1 -0
  93. package/out/src/parquet-wrapper/index.d.ts +12 -0
  94. package/out/src/parquet-wrapper/index.d.ts.map +1 -0
  95. package/out/src/parquet-wrapper/index.js +12 -0
  96. package/out/src/parquet-wrapper/index.js.map +1 -0
  97. package/out/src/parquet-wrapper/reader.d.ts +31 -0
  98. package/out/src/parquet-wrapper/reader.d.ts.map +1 -0
  99. package/out/src/parquet-wrapper/reader.js +54 -0
  100. package/out/src/parquet-wrapper/reader.js.map +1 -0
  101. package/out/src/parquet-wrapper/schema.d.ts +45 -0
  102. package/out/src/parquet-wrapper/schema.d.ts.map +1 -0
  103. package/out/src/parquet-wrapper/schema.js +55 -0
  104. package/out/src/parquet-wrapper/schema.js.map +1 -0
  105. package/out/src/parquet-wrapper/writer.d.ts +41 -0
  106. package/out/src/parquet-wrapper/writer.d.ts.map +1 -0
  107. package/out/src/parquet-wrapper/writer.js +71 -0
  108. package/out/src/parquet-wrapper/writer.js.map +1 -0
  109. package/out/src/parquet.d.ts +122 -0
  110. package/out/src/parquet.d.ts.map +1 -0
  111. package/out/src/parquet.js +220 -0
  112. package/out/src/parquet.js.map +1 -0
  113. package/out/src/runner.d.ts +100 -0
  114. package/out/src/runner.d.ts.map +1 -0
  115. package/out/src/runner.js +183 -0
  116. package/out/src/runner.js.map +1 -0
  117. package/out/src/split.d.ts +108 -0
  118. package/out/src/split.d.ts.map +1 -0
  119. package/out/src/split.js +191 -0
  120. package/out/src/split.js.map +1 -0
  121. package/out/src/synthesize.d.ts +146 -0
  122. package/out/src/synthesize.d.ts.map +1 -0
  123. package/out/src/synthesize.js +472 -0
  124. package/out/src/synthesize.js.map +1 -0
  125. package/out/src/tokenize.d.ts +47 -0
  126. package/out/src/tokenize.d.ts.map +1 -0
  127. package/out/src/tokenize.js +49 -0
  128. package/out/src/tokenize.js.map +1 -0
  129. package/out/src/types.d.ts +168 -0
  130. package/out/src/types.d.ts.map +1 -0
  131. package/out/src/types.js +19 -0
  132. package/out/src/types.js.map +1 -0
  133. package/out/src/wof-json.d.ts +105 -0
  134. package/out/src/wof-json.d.ts.map +1 -0
  135. package/out/src/wof-json.js +174 -0
  136. package/out/src/wof-json.js.map +1 -0
  137. package/package.json +36 -0
@@ -0,0 +1,183 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Adapter runner — drives a `CorpusAdapter` to completion and writes intermediate JSONL + a
7
+ * per-shard manifest.
8
+ *
9
+ * Output layout under `outputDir`:
10
+ *
11
+ * ```
12
+ * <outputDir>/<adapter.id>/
13
+ * canonical.jsonl # one row per line, in emission order
14
+ * MANIFEST.json # adapter id, version, row count, sha256, license, started_at, ended_at
15
+ * ```
16
+ *
17
+ * The runner is responsible for everything an adapter is **not** responsible for:
18
+ *
19
+ * - Stamping `corpus_version` on every row (adapters must NOT set it).
20
+ * - Applying `canonicalDedupKey` and skipping duplicates.
21
+ * - Streaming sha256 over JSONL bytes so the manifest checksum doesn't require a re-read.
22
+ * - Honoring backpressure on the output write stream.
23
+ * - Counting + emitting periodic progress to an optional callback.
24
+ * - Honoring `signal` (delegates to adapter's iteration boundary).
25
+ *
26
+ * The runner does NOT perform alignment, tokenization, synthesis, or sharding into Parquet. Those
27
+ * steps run later, consuming the JSONL shards this writes.
28
+ */
29
+ import { createWriteStream } from "node:fs";
30
+ import { mkdir, writeFile } from "node:fs/promises";
31
+ import { dirname, join } from "node:path";
32
+ import { canonicalDedupKey, streamingSha256 } from "./adapter.js";
33
/**
 * Drive a single adapter to completion.
 *
 * Writes `canonical.jsonl` + `MANIFEST.json` under `outputDir/<adapter.id>/` and returns the
 * manifest describing the run.
 *
 * Per row, in order: validate (`assertEmittedRow`), stamp `corpus_version`, dedup on
 * `canonicalDedupKey`, append one JSON line to the output, feed the same bytes to the streaming
 * sha256. Dedup keys are held in memory up to a fixed cap; once the cap is hit a warning is
 * written to stderr and every later row is written without a dedup check.
 *
 * @param opts Run options: `adapter`, `adapterOptions`, `outputDir`, `corpusVersion`, plus the
 *   optional `progressEvery` (defaults to 1,000 yielded rows) and `onProgress` callback.
 * @returns The manifest object that was written to `MANIFEST.json`.
 * @throws If the output directory cannot be created, if a row fails validation, if the abort
 *   signal is observed as aborted, or if the output stream reports an error. On any failure the
 *   manifest is NOT written; a partially written `canonical.jsonl` may remain on disk.
 */
export async function runAdapter(opts) {
    const { adapter, adapterOptions, outputDir, corpusVersion } = opts;
    const progressEvery = opts.progressEvery ?? 1_000;
    const adapterDir = join(outputDir, adapter.id);
    await mkdir(adapterDir, { recursive: true });
    const jsonlPath = join(adapterDir, "canonical.jsonl");
    const manifestPath = join(adapterDir, "MANIFEST.json");
    const startedAt = new Date();
    const t0 = performance.now();
    const stream = createWriteStream(jsonlPath, { encoding: "utf8" });
    // Keep a permanent `error` listener: without one, a write failure that happens while we are
    // not awaiting `drain` would surface as an unhandled 'error' event and kill the process.
    let streamError;
    stream.on("error", (err) => {
        if (streamError === undefined)
            streamError = err;
    });
    // Subscribe to `close` up front. A stream that fails emits `close` right after `error`;
    // subscribing only in `finally` would miss it and wait forever.
    const streamClosed = new Promise((resolve) => {
        stream.once("close", resolve);
    });
    const hasher = streamingSha256();
    const seen = new Set();
    const DEDUP_MAX_SIZE = 10_000_000;
    let dedupExhausted = false;
    let yielded = 0;
    let written = 0;
    let bytes = 0;
    const emitProgress = () => {
        opts.onProgress?.({
            adapterId: adapter.id,
            yielded,
            written,
            bytes,
            elapsed_ms: performance.now() - t0,
        });
    };
    try {
        for await (const row of adapter.rows(adapterOptions)) {
            // `adapterOptions` may be absent (e.g. `runAllAdapters` with no options configured).
            if (adapterOptions?.signal?.aborted) {
                throw new DOMException("Adapter run aborted by signal", "AbortError");
            }
            // Stop pulling rows as soon as the output stream has failed.
            if (streamError !== undefined) {
                throw streamError;
            }
            yielded++;
            assertEmittedRow(adapter, row);
            const stamped = { ...row, corpus_version: corpusVersion };
            const key = canonicalDedupKey(stamped);
            if (!dedupExhausted) {
                if (seen.has(key)) {
                    // Duplicate: counted as yielded, never written.
                    if (yielded % progressEvery === 0)
                        emitProgress();
                    continue;
                }
                if (seen.size >= DEDUP_MAX_SIZE) {
                    // Cap reached: stop tracking keys; this row and all later rows are written as-is.
                    dedupExhausted = true;
                    process.stderr.write(`  runner: dedup set full at ${DEDUP_MAX_SIZE.toLocaleString()} — skipping dedup for remaining rows\n`);
                }
                else {
                    seen.add(key);
                }
            }
            const line = `${JSON.stringify(stamped)}\n`;
            hasher.update(line);
            bytes += Buffer.byteLength(line, "utf8");
            written++;
            // Backpressure: wait for the stream to drain before pulling the next row.
            if (!stream.write(line)) {
                await once(stream, "drain");
            }
            if (yielded % progressEvery === 0)
                emitProgress();
        }
    }
    finally {
        // A failed stream is already destroyed; only a live one needs an explicit end().
        if (!stream.destroyed) {
            stream.end();
        }
        await streamClosed;
    }
    // Covers failures that only show up while flushing the tail of the file during end().
    if (streamError !== undefined) {
        throw streamError;
    }
    const endedAt = new Date();
    const elapsed_ms = performance.now() - t0;
    emitProgress();
    const manifest = {
        adapter_id: adapter.id,
        corpus_version: corpusVersion,
        default_license: adapter.defaultLicense,
        description: adapter.description,
        yielded,
        written,
        deduped: yielded - written,
        bytes,
        sha256: hasher.digest(),
        jsonl_path: jsonlPath,
        started_at: startedAt.toISOString(),
        ended_at: endedAt.toISOString(),
        elapsed_ms,
    };
    await writeFile(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`, "utf8");
    return manifest;
}
125
/**
 * Run every adapter from `registry.list()` one after another, awaiting each run before starting
 * the next. The first run that rejects aborts the whole sequence (filter the registry beforehand
 * if partial failure should be tolerated).
 *
 * Per-adapter options come from `common.adapterOptionsFor(adapter)` when that hook exists and
 * returns a non-nullish value; otherwise `common.adapterOptions` is used.
 *
 * @returns The manifests, in the order `registry.list()` produced the adapters.
 */
export async function runAllAdapters(registry, common) {
    const manifests = [];
    for (const adapter of registry.list()) {
        const perAdapter = common.adapterOptionsFor?.(adapter);
        const adapterOptions = perAdapter ?? common.adapterOptions;
        const manifest = await runAdapter({ ...common, adapter, adapterOptions });
        manifests.push(manifest);
    }
    return manifests;
}
143
/**
 * Sanity-check one row produced by an adapter before it is written. Runs once per row and throws
 * on the first problem found, so adapter bugs surface before the JSONL is partly filled with bad
 * rows.
 *
 * Checks, in order: `row.source` equals `adapter.id`; then `source_id`, `raw`, `country` and
 * `license` are each truthy.
 *
 * @throws Error naming the adapter and the offending field.
 */
function assertEmittedRow(adapter, row) {
    const fail = (detail) => {
        throw new Error(`adapter ${adapter.id}: ${detail}`);
    };
    if (row.source !== adapter.id) {
        fail(`row.source must equal adapter.id (got ${JSON.stringify(row.source)})`);
    }
    if (!row.source_id) {
        fail("row.source_id is empty");
    }
    // The remaining required fields share one message shape that includes the row's source_id.
    for (const field of ["raw", "country", "license"]) {
        if (!row[field]) {
            fail(`row.${field} is empty for source_id=${row.source_id}`);
        }
    }
}
164
/**
 * Wrap the next emission of `event` in a promise. Used to await `drain` / `close` on the write
 * stream.
 *
 * The promise resolves (with no value) when `event` fires, or rejects with the error if `error`
 * fires first. Whichever listener runs removes the other, so no listener is left on the emitter.
 */
function once(emitter, event) {
    return new Promise((resolve, reject) => {
        function handleEvent() {
            emitter.off("error", handleError);
            resolve();
        }
        function handleError(err) {
            emitter.off(event, handleEvent);
            reject(err);
        }
        emitter.once(event, handleEvent);
        emitter.once("error", handleError);
    });
}
179
/**
 * Convenience: create the directory that would contain `filePath`, including any missing
 * ancestors. Resolves without error when the directory already exists.
 */
export async function ensureParentDir(filePath) {
    const parentDir = dirname(filePath);
    await mkdir(parentDir, { recursive: true });
}
183
+ //# sourceMappingURL=runner.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AAEH,OAAO,EAAE,iBAAiB,EAAoB,MAAM,SAAS,CAAA;AAC7D,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAA;AACnD,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAA;AACzC,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAA8C,MAAM,cAAc,CAAA;AAiE7G;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,IAAuB;IACvD,MAAM,EAAE,OAAO,EAAE,cAAc,EAAE,SAAS,EAAE,aAAa,EAAE,GAAG,IAAI,CAAA;IAClE,MAAM,aAAa,GAAG,IAAI,CAAC,aAAa,IAAI,KAAK,CAAA;IAEjD,MAAM,UAAU,GAAG,IAAI,CAAC,SAAS,EAAE,OAAO,CAAC,EAAE,CAAC,CAAA;IAC9C,MAAM,KAAK,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;IAE5C,MAAM,SAAS,GAAG,IAAI,CAAC,UAAU,EAAE,iBAAiB,CAAC,CAAA;IACrD,MAAM,YAAY,GAAG,IAAI,CAAC,UAAU,EAAE,eAAe,CAAC,CAAA;IAEtD,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAA;IAC5B,MAAM,EAAE,GAAG,WAAW,CAAC,GAAG,EAAE,CAAA;IAE5B,MAAM,MAAM,GAAG,iBAAiB,CAAC,SAAS,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;IACjE,MAAM,MAAM,GAAoB,eAAe,EAAE,CAAA;IACjD,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAA;IAC9B,MAAM,cAAc,GAAG,UAAU,CAAA;IACjC,IAAI,cAAc,GAAG,KAAK,CAAA;IAE1B,IAAI,OAAO,GAAG,CAAC,CAAA;IACf,IAAI,OAAO,GAAG,CAAC,CAAA;IACf,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,MAAM,YAAY,GAAG,GAAS,EAAE;QAC/B,IAAI,CAAC,UAAU,EAAE,CAAC;YACjB,SAAS,EAAE,OAAO,CAAC,EAAE;YACrB,OAAO;YACP,OAAO;YACP,KAAK;YACL,UAAU,EAAE,WAAW,CAAC,GAAG,EAAE,GAAG,EAAE;SAClC,CAAC,CAAA;IACH,CAAC,CAAA;IAED,IAAI,CAAC;QACJ,IAAI,KAAK,EAAE,MAAM,GAAG,IAAI,OAAO,CAAC,IAAI,CAAC,cAAc,CAAC,EAAE,CAAC;YACtD,IAAI,cAAc,CAAC,MAAM,EAAE,OAAO,EAAE,CAAC;gBACpC,MAAM,IAAI,YAAY,CAAC,+BAA+B,EAAE,YAAY,CAAC,CAAA;YACtE,CAAC;YAED,OAAO,EAAE,CAAA;YACT,gBAAgB,CAAC,OAAO,EAAE,GAAG,CAAC,CAAA;YAE9B,MAAM,OAAO,GAAiB,EAAE,GAAG,GAAG,EAAE,cAAc,EAAE,aAAa,EAAE,CAAA;YACvE,MAAM,GAAG,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAA;YACtC,IAAI,CAAC,cAAc,EAAE,CAAC;gBACrB,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;oBACnB,IAAI,OAAO,GAAG,aAAa,KAAK,CAAC;wBAAE,YAAY,EAAE,CAAA;oBACjD,SAAQ;gBACT,CAAC;gBACD,IAAI,IAAI,CAAC,IAAI,IAAI,cAAc,EAAE,CAAC;oBACjC,cAAc,GAAG,IAAI,CAAA;oBACrB,OAAO,CAAC,MA
AM,CAAC,KAAK,CACnB,+BAA+B,cAAc,CAAC,cAAc,EAAE,wCAAwC,CACtG,CAAA;gBACF,CAAC;qBAAM,CAAC;oBACP,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;gBACd,CAAC;YACF,CAAC;YAED,MAAM,IAAI,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,IAAI,CAAA;YAC3C,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;YACnB,KAAK,IAAI,MAAM,CAAC,UAAU,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;YACxC,OAAO,EAAE,CAAA;YAET,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;gBACzB,MAAM,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;YAC5B,CAAC;YAED,IAAI,OAAO,GAAG,aAAa,KAAK,CAAC;gBAAE,YAAY,EAAE,CAAA;QAClD,CAAC;IACF,CAAC;YAAS,CAAC;QACV,MAAM,CAAC,GAAG,EAAE,CAAA;QACZ,MAAM,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IAC5B,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,IAAI,EAAE,CAAA;IAC1B,MAAM,UAAU,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,EAAE,CAAA;IACzC,YAAY,EAAE,CAAA;IAEd,MAAM,QAAQ,GAAuB;QACpC,UAAU,EAAE,OAAO,CAAC,EAAE;QACtB,cAAc,EAAE,aAAa;QAC7B,eAAe,EAAE,OAAO,CAAC,cAAc;QACvC,WAAW,EAAE,OAAO,CAAC,WAAW;QAChC,OAAO;QACP,OAAO;QACP,OAAO,EAAE,OAAO,GAAG,OAAO;QAC1B,KAAK;QACL,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE;QACvB,UAAU,EAAE,SAAS;QACrB,UAAU,EAAE,SAAS,CAAC,WAAW,EAAE;QACnC,QAAQ,EAAE,OAAO,CAAC,WAAW,EAAE;QAC/B,UAAU;KACV,CAAA;IAED,MAAM,SAAS,CAAC,YAAY,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;IAE/E,OAAO,QAAQ,CAAA;AAChB,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CACnC,QAAyB,EACzB,MAAyG;IAEzG,MAAM,GAAG,GAAyB,EAAE,CAAA;IACpC,KAAK,MAAM,OAAO,IAAI,QAAQ,CAAC,IAAI,EAAE,EAAE,CAAC;QACvC,MAAM,cAAc,GAAG,MAAM,CAAC,iBAAiB,EAAE,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,cAAc,CAAA;QACnF,GAAG,CAAC,IAAI,CACP,MAAM,UAAU,CAAC;YAChB,GAAG,MAAM;YACT,OAAO;YACP,cAAc;SACd,CAAC,CACF,CAAA;IACF,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;GAGG;AACH,SAAS,gBAAgB,CAAC,OAAsB,EAAE,GAAiB;IAClE,IAAI,GAAG,CAAC,MAAM,KAAK,OAAO,CAAC,EAAE,EAAE,CAAC;QAC/B,MAAM,IAAI,KAAK,CAAC,WAAW,OAAO,CAAC,EAAE,2CAA2C,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;IAC/G,CAAC;IACD,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,WAAW,OAAO,CAAC,EAAE,0BAA0B,CAAC,CAAA;IACjE,CAAC;IACD,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC;QACd,MAAM,IAAI,KA
AK,CAAC,WAAW,OAAO,CAAC,EAAE,oCAAoC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAA;IAC1F,CAAC;IACD,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC;QAClB,MAAM,IAAI,KAAK,CAAC,WAAW,OAAO,CAAC,EAAE,wCAAwC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAA;IAC9F,CAAC;IACD,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC;QAClB,MAAM,IAAI,KAAK,CAAC,WAAW,OAAO,CAAC,EAAE,wCAAwC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAA;IAC9F,CAAC;AACF,CAAC;AAED,gGAAgG;AAChG,SAAS,IAAI,CAAC,OAAoB,EAAE,KAAwB;IAC3D,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACtC,MAAM,OAAO,GAAG,GAAS,EAAE;YAC1B,OAAO,CAAC,GAAG,CAAC,OAAO,EAAE,OAAO,CAAC,CAAA;YAC7B,OAAO,EAAE,CAAA;QACV,CAAC,CAAA;QACD,MAAM,OAAO,GAAG,CAAC,GAAU,EAAQ,EAAE;YACpC,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,OAAO,CAAC,CAAA;YAC3B,MAAM,CAAC,GAAG,CAAC,CAAA;QACZ,CAAC,CAAA;QACD,OAAO,CAAC,IAAI,CAAC,KAAK,EAAE,OAAO,CAAC,CAAA;QAC5B,OAAO,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,CAAA;IAC/B,CAAC,CAAC,CAAA;AACH,CAAC;AAED,qEAAqE;AACrE,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,QAAgB;IACrD,MAAM,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;AACpD,CAAC"}
@@ -0,0 +1,108 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Train / val / test split with **locality holdout** per the Phase 1 plan.
7
+ *
8
+ * The corpus's val + test sets are not randomly sampled rows — they're entire low-density regions
9
+ * held out so the model cannot memorize them at training time. Rationale (per the plan's "Common
10
+ * pitfalls" section): random splits leak by neighborhood — a model fed "13 Main St, Springfield,
11
+ * IL" in train and "15 Main St, Springfield, IL" in test generalizes via region/locality
12
+ * memorization, not by learning the underlying schema.
13
+ *
14
+ * Phase 1 holdouts (chosen for low data density + administrative isolation):
15
+ *
16
+ * - **US**: Vermont, Wyoming, North Dakota
17
+ * - **FR**: Corse, Lozère, Creuse
18
+ *
19
+ * Held-out rows are deterministically split 50/50 between val and test by hashing the row's
20
+ * `source_id`. Non-held-out rows go to train. The 90/5/5 ratio is approximate — what matters is
21
+ * the locality boundary, not the exact split percentages.
22
+ *
23
+ * The output is a `SplitManifest`: three `string[]` arrays of `source_id`. Manifests live in git
24
+ * (under `corpus/splits/<version>/`) so reruns are reproducible bit-for-bit.
25
+ */
26
+ import type { CanonicalRow, LabeledRow } from "./types.js";
27
+ /** Name of a corpus partition; this is the value `splitForRow` returns for a row. */
+ export type SplitName = "train" | "val" | "test";
28
+ /** Options accepted by `splitRows`. */
+ export interface SplitOptions {
29
+ /**
30
+ * Held-out regions, keyed by ISO 3166-1 alpha-2 country. Each value lists the region strings
31
+ * that are compared (exact string match) against `row.components.region`. Override to change
32
+ * the holdout for an experiment; defaults to `defaultHoldouts()`.
33
+ */
34
+ holdouts?: Record<string, readonly string[]>;
35
+ }
36
+ /** Output manifest: one list of `source_id`s per split, plus the settings that produced them. */
37
+ export interface SplitManifest {
38
+ train: string[];
39
+ val: string[];
40
+ test: string[];
41
+ /** Echoes the holdouts used, so the manifest is self-describing. */
42
+ holdouts: Record<string, readonly string[]>;
43
+ /** Corpus version: taken from the first row that carries a non-empty one; `""` if none does. */
44
+ corpus_version: string;
45
+ /** List lengths for quick sanity checks; `total` is train + val + test. */
46
+ counts: {
47
+ train: number;
48
+ val: number;
49
+ test: number;
50
+ total: number;
51
+ };
52
+ }
53
+ /**
54
+ * Phase 1 default holdouts (per plan). Returns a newly built table on each call.
55
+ *
56
+ * - US: Vermont, Wyoming, North Dakota (low density, easy to identify in WOF/admin sources).
57
+ * - FR: Corse, Lozère, Creuse (small departments / regions).
58
+ */
59
+ export declare function defaultHoldouts(): Record<string, readonly string[]>;
60
+ /** The subset of `CanonicalRow` fields the splitter needs. */
+ type SplitInputRow = Pick<CanonicalRow, "source_id" | "country" | "corpus_version" | "components">;
61
+ /**
62
+ * Pure per-row split decision. Used by both the in-memory `splitRows` and by the streaming
63
+ * `buildCorpus` align loop (`build.ts`) to decide each row's split without retaining the row in
64
+ * heap. Identical hash bucketing semantics to the array-based path so the decision is stable
65
+ * regardless of caller.
66
+ *
+ * @param row Row to classify; only `source_id`, `country` and `components.region` are read.
+ * @param holdouts Held-out regions keyed by country; defaults to `defaultHoldouts()`.
+ * @returns `"train"` unless the row's region is listed for its country; held-out rows are
+ *   assigned `"val"` or `"test"` from a hash of `source_id`.
+ */
67
+ export declare function splitForRow(row: Pick<SplitInputRow, "source_id" | "country" | "components">, holdouts?: Record<string, readonly string[]>): SplitName;
68
+ /**
69
+ * Compute a `SplitManifest` from an iterable of labeled (or canonical) rows. Both shapes are
70
+ * accepted — only `source_id`, `country`, `corpus_version`, and `components.region` are consulted.
71
+ *
72
+ * Retained for in-memory callers (tests; small-scale fixture runs). Real-data builds via
73
+ * `buildCorpus` use the streaming path (`splitForRow` + `writeSplitManifestsFromLabeledFiles`) to
74
+ * avoid materializing every aligned row's split membership in heap.
75
+ *
+ * @param rows Rows to partition; iterated once.
+ * @param opts Optional holdout override; see `SplitOptions`.
+ * @returns The manifest, with ids in each list in the order the rows were iterated.
+ */
76
+ export declare function splitRows(rows: Iterable<SplitInputRow>, opts?: SplitOptions): SplitManifest;
77
+ /**
+ * Lightweight deterministic 0..(n-1) bucket from a string id: the first four bytes of
+ * SHA-256(`id`), read as a big-endian unsigned 32-bit integer, modulo `n`.
+ */
78
+ export declare function hashBucket(id: string, n: number): number;
79
+ /**
80
+ * Write a `SplitManifest` to `<outputDir>/{train,val,test}.txt`. The manifests are line-separated
81
+ * source_id lists (one id per line, sorted) so they diff cleanly in git. Also writes
82
+ * `<outputDir>/SPLIT_MANIFEST.json` with the summary: corpus version + holdouts + counts (the id
83
+ * lists themselves are only in the `.txt` files).
84
+ *
85
+ * Reruns produce byte-identical files (the underlying `splitRows` is deterministic).
86
+ */
87
+ export declare function writeSplitManifests(manifest: SplitManifest, outputDir: string): Promise<void>;
88
+ /** The same four splitter input fields, picked from `LabeledRow`, for callers that ingest labeled rows. */
89
+ export type SplitInputLabeledRow = Pick<LabeledRow, "source_id" | "country" | "corpus_version" | "components">;
90
+ /**
91
+ * Streaming variant of `writeSplitManifests`: derives the per-split source-id .txt manifests +
92
+ * `SPLIT_MANIFEST.json` by streaming three per-split labeled-row JSONL files (one per split).
93
+ * Sorting is delegated to the external `sort(1)` program from coreutils (it must be on `PATH`),
94
+ * which handles the deterministic sort with disk spill for files that exceed in-memory thresholds.
95
+ *
96
+ * Used by `buildCorpus` after the align loop has already partitioned labeled rows into
97
+ * `labeled-{train,val,test}.jsonl` via `splitForRow`. Counts are pre-computed by the align loop and
98
+ * passed in (zero re-scan).
99
+ *
+ * @returns The counts that were written to `SPLIT_MANIFEST.json`: `opts.counts` plus `total`.
+ */
100
+ export declare function writeSplitManifestsFromLabeledFiles(opts: {
+ /** Path of the labeled-row JSONL file for each split. */
101
+ labeledPaths: Record<SplitName, string>;
+ /** Directory that receives `{train,val,test}.txt` and `SPLIT_MANIFEST.json`; created if missing. */
102
+ outputDir: string;
+ /** Written to the summary as `corpus_version`. */
103
+ corpusVersion: string;
+ /** Row count per split; trusted as given, not re-derived from the files. */
104
+ counts: Record<SplitName, number>;
+ /** Echoed into the summary; defaults to `defaultHoldouts()`. */
105
+ holdouts?: Record<string, readonly string[]>;
106
+ }): Promise<SplitManifest["counts"]>;
107
+ export {};
108
+ //# sourceMappingURL=split.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"split.d.ts","sourceRoot":"","sources":["../../src/split.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAQH,OAAO,KAAK,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,YAAY,CAAA;AAE1D,MAAM,MAAM,SAAS,GAAG,OAAO,GAAG,KAAK,GAAG,MAAM,CAAA;AAEhD,MAAM,WAAW,YAAY;IAC5B;;;;OAIG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,CAAC,CAAA;CAC5C;AAED,kDAAkD;AAClD,MAAM,WAAW,aAAa;IAC7B,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,GAAG,EAAE,MAAM,EAAE,CAAA;IACb,IAAI,EAAE,MAAM,EAAE,CAAA;IACd,oEAAoE;IACpE,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,CAAC,CAAA;IAC3C,yEAAyE;IACzE,cAAc,EAAE,MAAM,CAAA;IACtB,sCAAsC;IACtC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAA;CACnE;AAED;;;;;GAKG;AACH,wBAAgB,eAAe,IAAI,MAAM,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,CAAC,CAKnE;AAED,KAAK,aAAa,GAAG,IAAI,CAAC,YAAY,EAAE,WAAW,GAAG,SAAS,GAAG,gBAAgB,GAAG,YAAY,CAAC,CAAA;AAElG;;;;;GAKG;AACH,wBAAgB,WAAW,CAC1B,GAAG,EAAE,IAAI,CAAC,aAAa,EAAE,WAAW,GAAG,SAAS,GAAG,YAAY,CAAC,EAChE,QAAQ,GAAE,MAAM,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,CAAqB,GAC7D,SAAS,CAOX;AAED;;;;;;;GAOG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,QAAQ,CAAC,aAAa,CAAC,EAAE,IAAI,GAAE,YAAiB,GAAG,aAAa,CAwB/F;AAED,kEAAkE;AAClE,wBAAgB,UAAU,CAAC,EAAE,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,GAAG,MAAM,CAKxD;AAED;;;;;;;GAOG;AACH,wBAAsB,mBAAmB,CAAC,QAAQ,EAAE,aAAa,EAAE,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAYnG;AAED,8EAA8E;AAC9E,MAAM,MAAM,oBAAoB,GAAG,IAAI,CAAC,UAAU,EAAE,WAAW,GAAG,SAAS,GAAG,gBAAgB,GAAG,YAAY,CAAC,CAAA;AAE9G;;;;;;;;;GASG;AACH,wBAAsB,mCAAmC,CAAC,IAAI,EAAE;IAC/D,YAAY,EAAE,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,CAAA;IACvC,SAAS,EAAE,MAAM,CAAA;IACjB,aAAa,EAAE,MAAM,CAAA;IACrB,MAAM,EAAE,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,CAAA;IACjC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,CAAC,CAAA;CAC5C,GAAG,OAAO,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC,CAkBnC"}
@@ -0,0 +1,191 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Train / val / test split with **locality holdout** per the Phase 1 plan.
7
+ *
8
+ * The corpus's val + test sets are not randomly sampled rows — they're entire low-density regions
9
+ * held out so the model cannot memorize them at training time. Rationale (per the plan's "Common
10
+ * pitfalls" section): random splits leak by neighborhood — a model fed "13 Main St, Springfield,
11
+ * IL" in train and "15 Main St, Springfield, IL" in test generalizes via region/locality
12
+ * memorization, not by learning the underlying schema.
13
+ *
14
+ * Phase 1 holdouts (chosen for low data density + administrative isolation):
15
+ *
16
+ * - **US**: Vermont, Wyoming, North Dakota
17
+ * - **FR**: Corse, Lozère, Creuse
18
+ *
19
+ * Held-out rows are deterministically split 50/50 between val and test by hashing the row's
20
+ * `source_id`. Non-held-out rows go to train. The 90/5/5 ratio is approximate — what matters is
21
+ * the locality boundary, not the exact split percentages.
22
+ *
23
+ * The output is a `SplitManifest`: three `string[]` arrays of `source_id`. Manifests live in git
24
+ * (under `corpus/splits/<version>/`) so reruns are reproducible bit-for-bit.
25
+ */
26
+ import { spawn } from "node:child_process";
27
+ import { createHash } from "node:crypto";
28
+ import { createReadStream, createWriteStream } from "node:fs";
29
+ import { mkdir, unlink, writeFile } from "node:fs/promises";
30
+ import { join } from "node:path";
31
+ import { createInterface } from "node:readline";
32
/**
 * Build the Phase 1 default holdout table (per plan), keyed by country code.
 *
 * Each region is listed under every spelling the table should match:
 *
 * - US: Vermont / VT, Wyoming / WY, North Dakota / ND (low density, easy to identify in WOF/admin
 *   sources).
 * - FR: Corse, Lozère / Lozere, Creuse (small departments / regions).
 *
 * A fresh object (with fresh arrays) is returned on every call.
 */
export function defaultHoldouts() {
    const unitedStates = ["Vermont", "VT", "Wyoming", "WY", "North Dakota", "ND"];
    const france = ["Corse", "Lozère", "Lozere", "Creuse"];
    return { US: unitedStates, FR: france };
}
44
/**
 * Decide which split a single row belongs to. Pure: the answer depends only on the row's
 * `country`, `components.region` and `source_id`, plus the holdout table.
 *
 * Shared by the in-memory `splitRows` and by the streaming `buildCorpus` align loop (`build.ts`),
 * so both paths bucket rows identically and no row has to be kept in heap to be classified.
 *
 * @param row Row to classify.
 * @param holdouts Held-out region strings keyed by country; defaults to `defaultHoldouts()`.
 * @returns `"train"` when the row has no region or its region is not listed for its country;
 *   otherwise `"val"` or `"test"`, chosen by `hashBucket(source_id, 2)`.
 */
export function splitForRow(row, holdouts = defaultHoldouts()) {
    const { region } = row.components;
    const heldOutRegions = holdouts[row.country] ?? [];
    if (region === undefined || !heldOutRegions.includes(region)) {
        return "train";
    }
    // Held out: halve deterministically by source_id hash so reruns agree.
    const bucket = hashBucket(row.source_id, 2);
    return bucket === 0 ? "val" : "test";
}
59
/**
 * Partition an iterable of labeled (or canonical) rows into a `SplitManifest`. Either row shape
 * works: only `source_id`, `country`, `corpus_version` and `components.region` are read.
 *
 * Kept for in-memory callers (tests; small fixture runs). Real-data builds via `buildCorpus` use
 * the streaming path (`splitForRow` + `writeSplitManifestsFromLabeledFiles`) so that per-row split
 * membership never has to be held in heap.
 *
 * @param rows Rows to partition; iterated once.
 * @param opts Optional `holdouts` override; defaults to `defaultHoldouts()`.
 * @returns Per-split `source_id` lists (in iteration order), the holdouts used, the first
 *   non-empty `corpus_version` seen (or `""`), and the list lengths.
 */
export function splitRows(rows, opts = {}) {
    const holdouts = opts.holdouts ?? defaultHoldouts();
    const idsBySplit = { train: [], val: [], test: [] };
    let corpusVersion = "";
    for (const row of rows) {
        if (corpusVersion === "" && row.corpus_version) {
            corpusVersion = row.corpus_version;
        }
        idsBySplit[splitForRow(row, holdouts)].push(row.source_id);
    }
    const { train, val, test } = idsBySplit;
    const counts = {
        train: train.length,
        val: val.length,
        test: test.length,
        total: train.length + val.length + test.length,
    };
    return { train, val, test, holdouts, corpus_version: corpusVersion, counts };
}
94
/**
 * Lightweight deterministic 0..(n-1) bucket from a string id.
 *
 * Takes the first four bytes of SHA-256(`id`) as a big-endian unsigned 32-bit integer (a plain
 * number, so no bigint is involved) and reduces it modulo `n`.
 */
export function hashBucket(id, n) {
    const leading32 = createHash("sha256").update(id).digest().readUInt32BE(0);
    return leading32 % n;
}
101
/**
 * Write a `SplitManifest` to disk under `outputDir` (created if missing):
 *
 * - `train.txt`, `val.txt`, `test.txt` — the split's `source_id`s, one per line, sorted, so the
 *   files diff cleanly in git. An empty split produces an empty file.
 * - `SPLIT_MANIFEST.json` — the summary: `corpus_version`, `holdouts` and `counts`. The id lists
 *   themselves are only in the `.txt` files.
 *
 * Ids are ordered by their UTF-8 bytes, which is the order `LC_ALL=C sort` produces in the
 * streaming writer, so both writers emit the same file for the same ids. (JavaScript's default
 * sort compares UTF-16 code units, which orders characters outside the BMP differently.)
 *
 * Reruns produce byte-identical files for the same manifest.
 *
 * @param manifest Manifest to persist; its arrays are not mutated.
 * @param outputDir Destination directory.
 */
export async function writeSplitManifests(manifest, outputDir) {
    await mkdir(outputDir, { recursive: true });
    for (const name of ["train", "val", "test"]) {
        // Encode each id once, then compare the encoded bytes.
        const sorted = manifest[name]
            .map((id) => ({ id, bytes: Buffer.from(id, "utf8") }))
            .sort((a, b) => Buffer.compare(a.bytes, b.bytes))
            .map((entry) => entry.id);
        const body = sorted.length > 0 ? `${sorted.join("\n")}\n` : "";
        await writeFile(join(outputDir, `${name}.txt`), body, "utf8");
    }
    const summary = {
        corpus_version: manifest.corpus_version,
        holdouts: manifest.holdouts,
        counts: manifest.counts,
    };
    await writeFile(join(outputDir, "SPLIT_MANIFEST.json"), `${JSON.stringify(summary, null, 2)}\n`, "utf8");
}
122
/**
 * Streaming counterpart of `writeSplitManifests`. For each split it turns the split's labeled-row
 * JSONL file into `<outputDir>/<split>.txt` (sorted `source_id`s, via `streamSortedSourceIds`),
 * one split at a time, then writes `<outputDir>/SPLIT_MANIFEST.json`.
 *
 * Called by `buildCorpus` once the align loop has partitioned labeled rows into
 * `labeled-{train,val,test}.jsonl` via `splitForRow`. The per-split counts are supplied by the
 * caller and written as given; the files are not re-scanned to verify them.
 *
 * @param opts `labeledPaths` (JSONL path per split), `outputDir`, `corpusVersion`, `counts`
 *   (rows per split) and optional `holdouts` (defaults to `defaultHoldouts()`; only echoed into
 *   the summary).
 * @returns The counts written to the summary: `opts.counts` plus `total`.
 */
export async function writeSplitManifestsFromLabeledFiles(opts) {
    const { outputDir, labeledPaths, counts, corpusVersion } = opts;
    await mkdir(outputDir, { recursive: true });
    const holdouts = opts.holdouts ?? defaultHoldouts();
    for (const split of ["train", "val", "test"]) {
        await streamSortedSourceIds(labeledPaths[split], join(outputDir, `${split}.txt`));
    }
    const summary = {
        corpus_version: corpusVersion,
        holdouts,
        counts: { ...counts, total: counts.train + counts.val + counts.test },
    };
    const json = `${JSON.stringify(summary, null, 2)}\n`;
    await writeFile(join(outputDir, "SPLIT_MANIFEST.json"), json, "utf8");
    return summary.counts;
}
149
/**
 * Extract the `source_id` of every row in a labeled JSONL file and write them, byte-sorted, to
 * `outPath`. Empty input → empty output file (not absent).
 *
 * Works in two passes through a temporary `<outPath>.unsorted` file: ids are streamed out of the
 * JSONL, then `sort(1)` (which spills to disk as needed) orders them. The temporary file is
 * removed afterwards whether or not the run succeeded.
 *
 * @param labeledJsonlPath JSONL file to read; blank lines and rows without a string `source_id`
 *   are skipped.
 * @param outPath Destination for the sorted ids, one per line.
 * @throws If a line is not valid JSON, if either file stream fails, or if `sort` cannot be
 *   started or does not exit with code 0.
 */
async function streamSortedSourceIds(labeledJsonlPath, outPath) {
    const unsortedPath = `${outPath}.unsorted`;
    try {
        await extractSourceIds(labeledJsonlPath, unsortedPath);
        await sortFileBytewise(unsortedPath, outPath);
    }
    finally {
        // Best-effort cleanup: the file may not exist if the extraction never got to create it.
        await unlink(unsortedPath).catch(() => { });
    }
}
/** Stream `source_id`s from a JSONL file into `unsortedPath`, one per line, honoring backpressure. */
function extractSourceIds(labeledJsonlPath, unsortedPath) {
    return new Promise((resolve, reject) => {
        const input = createReadStream(labeledJsonlPath, { encoding: "utf8" });
        const out = createWriteStream(unsortedPath, { encoding: "utf8" });
        const rl = createInterface({ input, crlfDelay: Infinity });
        let failure;
        let readerClosed = false;
        let draining = false;
        // First failure wins: stop reading, tear both streams down, settle once `out` has closed.
        const fail = (err) => {
            if (failure !== undefined)
                return;
            failure = err;
            rl.close();
            input.destroy();
            out.destroy();
        };
        rl.on("line", (line) => {
            if (failure !== undefined || !line)
                return;
            let sourceId;
            try {
                sourceId = JSON.parse(line).source_id;
            }
            catch (err) {
                fail(err);
                return;
            }
            if (typeof sourceId !== "string")
                return;
            if (!out.write(`${sourceId}\n`) && !draining) {
                // Writer is behind: pause the reader until the writer drains. Lines already
                // buffered by readline may still arrive, hence the `draining` guard.
                draining = true;
                rl.pause();
                out.once("drain", () => {
                    draining = false;
                    if (!readerClosed)
                        rl.resume();
                });
            }
        });
        rl.on("close", () => {
            readerClosed = true;
            if (failure === undefined)
                out.end();
        });
        rl.on("error", fail);
        // Listen on the file stream directly as well, so an open/read failure always reaches `fail`.
        input.on("error", fail);
        out.on("error", fail);
        out.on("close", () => {
            if (failure === undefined)
                resolve();
            else
                reject(failure);
        });
    });
}
/** Sort the lines of `unsortedPath` into `outPath` with the external `sort` program. */
function sortFileBytewise(unsortedPath, outPath) {
    return new Promise((resolve, reject) => {
        // LC_ALL=C: byte-sort, locale-independent → deterministic across hosts.
        // `--` ends option parsing so a path that starts with "-" is still treated as a file.
        const proc = spawn("sort", ["-o", outPath, "--", unsortedPath], {
            env: { ...process.env, LC_ALL: "C" },
            stdio: ["ignore", "ignore", "pipe"],
        });
        // Keep a bounded amount of stderr for the error message; reading it also keeps the pipe
        // from filling up.
        let stderrText = "";
        proc.stderr?.setEncoding("utf8");
        proc.stderr?.on("data", (chunk) => {
            if (stderrText.length < 4096)
                stderrText += chunk;
        });
        proc.on("error", reject);
        proc.on("close", (code, signal) => {
            if (code === 0) {
                resolve();
                return;
            }
            const how = signal ? `was killed by signal ${signal}` : `exited with code ${code}`;
            const detail = stderrText.trim();
            reject(new Error(detail ? `sort ${how}: ${detail}` : `sort ${how}`));
        });
    });
}
191
+ //# sourceMappingURL=split.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"split.js","sourceRoot":"","sources":["../../src/split.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAA;AAC1C,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAA;AACxC,OAAO,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,SAAS,CAAA;AAC7D,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAA;AAC3D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAA;AAChC,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAA;AA2B/C;;;;;GAKG;AACH,MAAM,UAAU,eAAe;IAC9B,OAAO;QACN,EAAE,EAAE,CAAC,SAAS,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,cAAc,EAAE,IAAI,CAAC;QAC5D,EAAE,EAAE,CAAC,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,CAAC;KAC3C,CAAA;AACF,CAAC;AAID;;;;;GAKG;AACH,MAAM,UAAU,WAAW,CAC1B,GAAgE,EAChE,WAA8C,eAAe,EAAE;IAE/D,MAAM,MAAM,GAAG,GAAG,CAAC,UAAU,CAAC,MAAM,CAAA;IACpC,MAAM,eAAe,GAAG,QAAQ,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,CAAA;IACnD,MAAM,SAAS,GAAG,MAAM,KAAK,SAAS,IAAI,eAAe,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAA;IAC1E,IAAI,CAAC,SAAS;QAAE,OAAO,OAAO,CAAA;IAC9B,oFAAoF;IACpF,OAAO,UAAU,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,CAAA;AAC3D,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,SAAS,CAAC,IAA6B,EAAE,OAAqB,EAAE;IAC/E,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,IAAI,eAAe,EAAE,CAAA;IACnD,MAAM,KAAK,GAAa,EAAE,CAAA;IAC1B,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,MAAM,IAAI,GAAa,EAAE,CAAA;IACzB,IAAI,cAAc,GAAG,EAAE,CAAA;IAEvB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACxB,IAAI,CAAC,cAAc,IAAI,GAAG,CAAC,cAAc;YAAE,cAAc,GAAG,GAAG,CAAC,cAAc,CAAA;QAC9E,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAA;QACxC,IAAI,KAAK,KAAK,OAAO;YAAE,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,CAAA;aAC3C,IAAI,KAAK,KAAK,KAAK;YAAE,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,CAAA;;YAC5C,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,CAAA;IAC9B,CAAC;IAED,MAAM,KAAK,GAAG,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAA;IACrD,OAAO;QACN,KAAK;QACL,GAAG;QACH,IAAI;QACJ,QAAQ;QACR,cAAc;QACd,MAAM,EAAE,EAAE,KAAK,EAAE,KAAK,CAAC,MAAM,EAAE,GAAG,EAAE,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,MAAM,EAAE,KAAK,EAAE;KAC1E,CAAA;AACF,CAAC;AAED,kEAAkE;AAClE,MAAM,U
AAU,UAAU,CAAC,EAAU,EAAE,CAAS;IAC/C,MAAM,MAAM,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,MAAM,EAAE,CAAA;IACvD,mDAAmD;IACnD,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAE,GAAG,aAAa,GAAG,MAAM,CAAC,CAAC,CAAE,GAAG,UAAU,GAAG,MAAM,CAAC,CAAC,CAAE,GAAG,OAAO,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;IAClG,OAAO,CAAC,GAAG,CAAC,CAAA;AACb,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,QAAuB,EAAE,SAAiB;IACnF,MAAM,KAAK,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;IAC3C,KAAK,MAAM,IAAI,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,MAAM,CAAU,EAAE,CAAC;QACtD,MAAM,MAAM,GAAG,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;QACzC,MAAM,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,GAAG,IAAI,MAAM,CAAC,EAAE,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,MAAM,CAAC,CAAA;IACzG,CAAC;IACD,MAAM,OAAO,GAAG;QACf,cAAc,EAAE,QAAQ,CAAC,cAAc;QACvC,QAAQ,EAAE,QAAQ,CAAC,QAAQ;QAC3B,MAAM,EAAE,QAAQ,CAAC,MAAM;KACvB,CAAA;IACD,MAAM,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,qBAAqB,CAAC,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;AACzG,CAAC;AAKD;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,mCAAmC,CAAC,IAMzD;IACA,MAAM,KAAK,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;IAChD,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,IAAI,eAAe,EAAE,CAAA;IAEnD,KAAK,MAAM,KAAK,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,MAAM,CAAU,EAAE,CAAC;QACvD,MAAM,WAAW,GAAG,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAA;QAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,GAAG,KAAK,MAAM,CAAC,CAAA;QACpD,MAAM,qBAAqB,CAAC,WAAW,EAAE,OAAO,CAAC,CAAA;IAClD,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAA;IACpE,MAAM,OAAO,GAAG;QACf,cAAc,EAAE,IAAI,CAAC,aAAa;QAClC,QAAQ;QACR,MAAM,EAAE,EAAE,GAAG,IAAI,CAAC,MAAM,EAAE,KAAK,EAAE;KACjC,CAAA;IACD,MAAM,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,qBAAqB,CAAC,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;IAC7G,OAAO,OAAO,CAAC,MAAM,CAAA;AACtB,CAAC;AAED;;;;GAIG;AACH,KAAK,UAAU,qBAAqB,CAAC,gBAAwB,E
AAE,OAAe;IAC7E,MAAM,YAAY,GAAG,GAAG,OAAO,WAAW,CAAA;IAC1C,MAAM,GAAG,GAAG,iBAAiB,CAAC,YAAY,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;IACjE,MAAM,EAAE,GAAG,eAAe,CAAC,EAAE,KAAK,EAAE,gBAAgB,CAAC,gBAAgB,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAA;IAEpH,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QAC3C,EAAE,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;YACtB,IAAI,CAAC,IAAI;gBAAE,OAAM;YACjB,IAAI,CAAC;gBACJ,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAA2B,CAAA;gBACtD,IAAI,OAAO,GAAG,CAAC,SAAS,KAAK,QAAQ;oBAAE,GAAG,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,SAAS,IAAI,CAAC,CAAA;YACvE,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACd,MAAM,CAAC,GAAY,CAAC,CAAA;YACrB,CAAC;QACF,CAAC,CAAC,CAAA;QACF,EAAE,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;YACnB,GAAG,CAAC,GAAG,EAAE,CAAA;QACV,CAAC,CAAC,CAAA;QACF,EAAE,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;QACtB,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,OAAO,EAAE,CAAC,CAAA;QAChC,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;IACxB,CAAC,CAAC,CAAA;IAEF,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QAC3C,wEAAwE;QACxE,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,YAAY,EAAE,IAAI,EAAE,OAAO,CAAC,EAAE,EAAE,GAAG,EAAE,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,EAAE,CAAC,CAAA;QACnG,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;QACxB,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;YACxB,IAAI,IAAI,KAAK,CAAC;gBAAE,OAAO,EAAE,CAAA;;gBACpB,MAAM,CAAC,IAAI,KAAK,CAAC,yBAAyB,IAAI,EAAE,CAAC,CAAC,CAAA;QACxD,CAAC,CAAC,CAAA;IACH,CAAC,CAAC,CAAA;IACF,MAAM,MAAM,CAAC,YAAY,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAA;AAC3C,CAAC"}
@@ -0,0 +1,146 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Synthesis / augmentation per Phase 1 task #6.
7
+ *
8
+ * An `Augmentation` is a pure function that takes a `CanonicalRow` and either returns a new
9
+ * `CanonicalRow` (with `raw` AND `components` transformed in lockstep so alignment still
10
+ * succeeds) or `null` when the augmentation doesn't apply to the row's shape.
11
+ *
12
+ * Synthesis runs **before** alignment: augmentations transform raw + components together, and the
13
+ * runner reruns alignment on each augmented row to produce its labels. This keeps the synthesis
14
+ * surface small (no token/label arithmetic) at the cost of a re-run.
15
+ *
16
+ * Every augmented row carries the `synth` marker:
17
+ *
18
+ * - `method`: the augmentation's stable id (e.g. `"case-upper"`, `"accent-strip"`).
19
+ * - `base_source_id`: the source_id of the un-augmented (or upstream-augmented) row, so ancestry is
20
+ * traceable.
21
+ *
22
+ * Phase 1 implements the locale-agnostic + most useful US/FR augmentations. Typo injection and
23
+ * other stochastic augmentations are intentionally deferred — they need a seed-aware API and are
24
+ * most useful at training time, not corpus build time.
25
+ */
26
+ import { type Tokenizer } from "./tokenize.js";
27
+ import type { CanonicalRow, LabeledRow, QuarantinedRow } from "./types.js";
28
+ /**
29
+ * An augmentation transforms a single row. Return `null` if the augmentation doesn't apply (e.g.
30
+ * accent-strip on a row that has no accents; particle-strip on a US row).
31
+ */
32
+ export type Augmentation = (row: CanonicalRow) => CanonicalRow | null;
33
+ /** Upper-case raw + every component value. Returns null if already all-upper. */
34
+ export declare const caseUpper: Augmentation;
35
+ /** Lower-case raw + every component value. Returns null if already all-lower. */
36
+ export declare const caseLower: Augmentation;
37
+ /** Drop commas from `raw`. Components unchanged (they didn't carry commas). */
38
+ export declare const dropCommas: Augmentation;
39
+ /**
40
+ * Replace single spaces with double spaces in `raw` AND in every component value. The component
41
+ * update is load-bearing for alignment: `alignRow` substring-searches each component's surface form
42
+ * inside `raw`, so doubling the spaces in `raw` only would leave single-spaced components
43
+ * unfindable (this was the bug behind v0.1.1's first build attempt — 99.9% of quarantined rows
44
+ * traced back to this augmentation). Doubling both keeps the substring contract intact.
45
+ */
46
+ export declare const doubleSpace: Augmentation;
47
+ /**
48
+ * Strip Unicode combining marks (accents, diacritics) from raw + components. "Hôtel" → "Hotel";
49
+ * "Île-de-France" → "Ile-de-France". Returns null if the row has no accents.
50
+ */
51
+ export declare const accentStrip: Augmentation;
52
+ /** US: substitute the full state name for its alpha-2 abbreviation. */
53
+ export declare const stateExpand: Augmentation;
54
+ /** US: substitute the alpha-2 abbreviation for the full state name. */
55
+ export declare const stateAbbreviate: Augmentation;
56
+ /** US: expand directional abbreviations in `street`/`street_suffix` (NW → Northwest). */
57
+ export declare const directionalExpand: Augmentation;
58
+ /** US: abbreviate directional words (Northwest → NW). */
59
+ export declare const directionalAbbreviate: Augmentation;
60
+ /**
61
+ * US: swap the trailing street-suffix word in `components.street` to its preferred USPS
62
+ * abbreviation, preserving case. `"5th Avenue"` → `"5th Ave"`; `"5TH AVENUE"` → `"5TH AVE"`;
63
+ * `"main street"` → `"main st"`. Returns null when no trailing suffix is recognized, when the trailing
64
+ * word is already the preferred abbreviation, or when the swap would leave `raw` untouched
65
+ * (alignment requires both raw and components to move in lockstep).
66
+ *
67
+ * Targets the trailing word only to avoid mangling streets like "Avenue of the Americas" where the
68
+ * suffix-shaped word is part of the proper name rather than a USPS suffix.
69
+ */
70
+ export declare const streetSuffixAbbreviate: Augmentation;
71
+ /**
72
+ * US: swap the trailing street-suffix word in `components.street` to its full canonical form,
73
+ * preserving case. `"5th Ave"` → `"5th Avenue"`; `"5TH AVE"` → `"5TH AVENUE"`; `"main st"` →
73
+ * `"main street"`. Returns null when no trailing suffix is recognized, when the trailing word is already
75
+ * the canonical full form, or when the swap would leave `raw` untouched.
76
+ *
77
+ * Same trailing-word-only rule as `streetSuffixAbbreviate`.
78
+ */
79
+ export declare const streetSuffixExpand: Augmentation;
80
+ /** US: ZIP+4 form `12345-6789` → `123456789` (dash dropped). */
81
+ export declare const zipPlus4DashDrop: Augmentation;
82
+ /** FR: drop the article particle from a street ("Rue de la République" → "Rue République"). */
83
+ export declare const particleStrip: Augmentation;
84
+ /** Stable id → augmentation table. */
85
+ export declare const AUGMENTATIONS: Record<string, Augmentation>;
86
+ /** Default augmentation set, by country. Phase 1: US + FR; others get the locale-agnostic set. */
87
+ export declare function defaultAugmentationsForCountry(country: string): readonly Augmentation[];
88
+ /**
89
+ * Run every augmentation against a row; collect the non-null outputs. The augmentations are pure,
90
+ * so callers can compose them off this generator (e.g. nesting accent-strip ∘ state-abbreviate).
91
+ */
92
+ export declare function synthesizeRow(row: CanonicalRow, augmentations?: readonly Augmentation[]): Generator<CanonicalRow>;
93
+ /** Options accepted by `composeAdversarialRow`. */
94
+ export interface ComposeAdversarialOptions {
95
+ /**
96
+ * Stable pattern label written into the emitted row's `synth.method` field (as
97
+ * `compose:<pattern>`). Free-form but should be one of a small set of canonical pattern names so
98
+ * downstream filtering / stratification can target individual patterns.
99
+ *
100
+ * Recommended values (Phase 1.6 §2.1):
101
+ *
102
+ * - `"place-name-venue"` — venue token shared with locality (`Buffalo Health Clinic, Buffalo NY`).
103
+ * - `"place-shaped-venue"` — venue contains a place-shaped substring
104
+ * (`New York, New York Steakhouse, Las Vegas NV`).
105
+ * - `"particle-honorific"` — apostrophe + St./Saint ambiguity (`P'tit St. Denis Street Café`).
106
+ */
107
+ pattern: string;
108
+ /**
109
+ * Separator inserted between the venue and the address `raw`. Default `", "`. Single space
110
+ * (`" "`) produces the harder unpunctuated variant; newline (`"\n"`) the multi-line variant.
111
+ */
112
+ separator?: string;
113
+ /**
114
+ * Tokenizer to apply to the venue prefix. Default `whitespaceTokenizer()`. The address half uses
115
+ * the same tokenizer when re-aligned — pass a consistent one if customizing.
116
+ */
117
+ tokenizer?: Tokenizer;
118
+ }
119
+ /** Either a successful labeled composition or a quarantined attempt. */
120
+ export type ComposeResult = {
121
+ kind: "labeled";
122
+ row: LabeledRow;
123
+ } | {
124
+ kind: "quarantined";
125
+ row: QuarantinedRow;
126
+ };
127
+ /**
128
+ * Compose a venue string + an address row into a single adversarial `LabeledRow`.
129
+ *
130
+ * The emitted row's `raw` is `${venue}${separator}${address.raw}`. Tokens are produced by
131
+ * tokenizing the two halves independently and concatenating; labels are venue tokens → `B-venue` /
132
+ * `I-venue` followed by the address's labels (obtained by aligning the input address in isolation).
133
+ * This deterministic boundary is the entire point of the primitive: the embedded place-shaped
134
+ * tokens in the venue stay labeled as `venue`, never as the address's locality / region / etc.,
135
+ * even when they share surface forms.
136
+ *
137
+ * The address's components are forwarded as-is (alignment ran on them and they survived); `venue`
138
+ * is added on top with the trimmed venue string as its surface form.
139
+ *
140
+ * Returns `{ kind: "quarantined" }` when:
141
+ *
142
+ * - The venue is empty or whitespace-only.
143
+ * - The address row fails alignment in isolation (the underlying failure reason is propagated).
144
+ */
145
+ export declare function composeAdversarialRow(venue: string, address: CanonicalRow, options: ComposeAdversarialOptions): ComposeResult;
146
+ //# sourceMappingURL=synthesize.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"synthesize.d.ts","sourceRoot":"","sources":["../../src/synthesize.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAKH,OAAO,EAAuB,KAAK,SAAS,EAAE,MAAM,eAAe,CAAA;AACnE,OAAO,KAAK,EAAE,YAAY,EAAE,UAAU,EAAE,cAAc,EAAE,MAAM,YAAY,CAAA;AAE1E;;;GAGG;AACH,MAAM,MAAM,YAAY,GAAG,CAAC,GAAG,EAAE,YAAY,KAAK,YAAY,GAAG,IAAI,CAAA;AAyBrE,iFAAiF;AACjF,eAAO,MAAM,SAAS,EAAE,YAQvB,CAAA;AAED,iFAAiF;AACjF,eAAO,MAAM,SAAS,EAAE,YAQvB,CAAA;AAED,+EAA+E;AAC/E,eAAO,MAAM,UAAU,EAAE,YAIxB,CAAA;AAED;;;;;;GAMG;AACH,eAAO,MAAM,WAAW,EAAE,YAQzB,CAAA;AAED;;;GAGG;AACH,eAAO,MAAM,WAAW,EAAE,YAQzB,CAAA;AAqED,uEAAuE;AACvE,eAAO,MAAM,WAAW,EAAE,YAazB,CAAA;AAED,uEAAuE;AACvE,eAAO,MAAM,eAAe,EAAE,YAW7B,CAAA;AAgBD,yFAAyF;AACzF,eAAO,MAAM,iBAAiB,EAAE,YAkB/B,CAAA;AAED,yDAAyD;AACzD,eAAO,MAAM,qBAAqB,EAAE,YAqBnC,CAAA;AAED;;;;;;;;;GASG;AACH,eAAO,MAAM,sBAAsB,EAAE,YAkBpC,CAAA;AAED;;;;;;;GAOG;AACH,eAAO,MAAM,kBAAkB,EAAE,YAiBhC,CAAA;AAED,gEAAgE;AAChE,eAAO,MAAM,gBAAgB,EAAE,YAQ9B,CAAA;AAMD,+FAA+F;AAC/F,eAAO,MAAM,aAAa,EAAE,YAW3B,CAAA;AAMD,sCAAsC;AACtC,eAAO,MAAM,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CActD,CAAA;AAED,kGAAkG;AAClG,wBAAgB,8BAA8B,CAAC,OAAO,EAAE,MAAM,GAAG,SAAS,YAAY,EAAE,CAmBvF;AAED;;;GAGG;AACH,wBAAiB,aAAa,CAC7B,GAAG,EAAE,YAAY,EACjB,aAAa,GAAE,SAAS,YAAY,EAAgD,GAClF,SAAS,CAAC,YAAY,CAAC,CAKzB;AAqCD,mDAAmD;AACnD,MAAM,WAAW,yBAAyB;IACzC;;;;;;;;;;;OAWG;IACH,OAAO,EAAE,MAAM,CAAA;IAEf;;;OAGG;IACH,SAAS,CAAC,EAAE,MAAM,CAAA;IAElB;;;OAGG;IACH,SAAS,CAAC,EAAE,SAAS,CAAA;CACrB;AAED,wEAAwE;AACxE,MAAM,MAAM,aAAa,GAAG;IAAE,IAAI,EAAE,SAAS,CAAC;IAAC,GAAG,EAAE,UAAU,CAAA;CAAE,GAAG;IAAE,IAAI,EAAE,aAAa,CAAC;IAAC,GAAG,EAAE,cAAc,CAAA;CAAE,CAAA;AAE/G;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,qBAAqB,CACpC,KAAK,EAAE,MAAM,EACb,OAAO,EAAE,YAAY,EACrB,OAAO,EAAE,yBAAyB,GAChC,aAAa,CAsDf"}