@mailwoman/corpus 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137):
  1. package/out/src/adapter.d.ts +96 -0
  2. package/out/src/adapter.d.ts.map +1 -0
  3. package/out/src/adapter.js +107 -0
  4. package/out/src/adapter.js.map +1 -0
  5. package/out/src/adapters/ban/adapter.d.ts +32 -0
  6. package/out/src/adapters/ban/adapter.d.ts.map +1 -0
  7. package/out/src/adapters/ban/adapter.js +133 -0
  8. package/out/src/adapters/ban/adapter.js.map +1 -0
  9. package/out/src/adapters/fcc-bdc/adapter.d.ts +61 -0
  10. package/out/src/adapters/fcc-bdc/adapter.d.ts.map +1 -0
  11. package/out/src/adapters/fcc-bdc/adapter.js +153 -0
  12. package/out/src/adapters/fcc-bdc/adapter.js.map +1 -0
  13. package/out/src/adapters/index.d.ts +42 -0
  14. package/out/src/adapters/index.d.ts.map +1 -0
  15. package/out/src/adapters/index.js +76 -0
  16. package/out/src/adapters/index.js.map +1 -0
  17. package/out/src/adapters/openaddresses/adapter.d.ts +60 -0
  18. package/out/src/adapters/openaddresses/adapter.d.ts.map +1 -0
  19. package/out/src/adapters/openaddresses/adapter.js +174 -0
  20. package/out/src/adapters/openaddresses/adapter.js.map +1 -0
  21. package/out/src/adapters/state-ia-contractors/adapter.d.ts +23 -0
  22. package/out/src/adapters/state-ia-contractors/adapter.d.ts.map +1 -0
  23. package/out/src/adapters/state-ia-contractors/adapter.js +113 -0
  24. package/out/src/adapters/state-ia-contractors/adapter.js.map +1 -0
  25. package/out/src/adapters/state-ny-notaries/adapter.d.ts +21 -0
  26. package/out/src/adapters/state-ny-notaries/adapter.d.ts.map +1 -0
  27. package/out/src/adapters/state-ny-notaries/adapter.js +132 -0
  28. package/out/src/adapters/state-ny-notaries/adapter.js.map +1 -0
  29. package/out/src/adapters/state-tx-notaries/adapter.d.ts +22 -0
  30. package/out/src/adapters/state-tx-notaries/adapter.d.ts.map +1 -0
  31. package/out/src/adapters/state-tx-notaries/adapter.js +125 -0
  32. package/out/src/adapters/state-tx-notaries/adapter.js.map +1 -0
  33. package/out/src/adapters/tiger/adapter.d.ts +45 -0
  34. package/out/src/adapters/tiger/adapter.d.ts.map +1 -0
  35. package/out/src/adapters/tiger/adapter.js +179 -0
  36. package/out/src/adapters/tiger/adapter.js.map +1 -0
  37. package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts +36 -0
  38. package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts.map +1 -0
  39. package/out/src/adapters/usgov-hrsa-fqhc/adapter.js +147 -0
  40. package/out/src/adapters/usgov-hrsa-fqhc/adapter.js.map +1 -0
  41. package/out/src/adapters/usgov-imls-pls/adapter.d.ts +25 -0
  42. package/out/src/adapters/usgov-imls-pls/adapter.d.ts.map +1 -0
  43. package/out/src/adapters/usgov-imls-pls/adapter.js +118 -0
  44. package/out/src/adapters/usgov-imls-pls/adapter.js.map +1 -0
  45. package/out/src/adapters/usgov-nad/adapter.d.ts +37 -0
  46. package/out/src/adapters/usgov-nad/adapter.d.ts.map +1 -0
  47. package/out/src/adapters/usgov-nad/adapter.js +227 -0
  48. package/out/src/adapters/usgov-nad/adapter.js.map +1 -0
  49. package/out/src/adapters/usgov-nppes/adapter.d.ts +28 -0
  50. package/out/src/adapters/usgov-nppes/adapter.d.ts.map +1 -0
  51. package/out/src/adapters/usgov-nppes/adapter.js +123 -0
  52. package/out/src/adapters/usgov-nppes/adapter.js.map +1 -0
  53. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts +35 -0
  54. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts.map +1 -0
  55. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js +162 -0
  56. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js.map +1 -0
  57. package/out/src/adapters/wof-admin-json/adapter.d.ts +85 -0
  58. package/out/src/adapters/wof-admin-json/adapter.d.ts.map +1 -0
  59. package/out/src/adapters/wof-admin-json/adapter.js +241 -0
  60. package/out/src/adapters/wof-admin-json/adapter.js.map +1 -0
  61. package/out/src/adapters/wof-postalcode-json/adapter.d.ts +63 -0
  62. package/out/src/adapters/wof-postalcode-json/adapter.d.ts.map +1 -0
  63. package/out/src/adapters/wof-postalcode-json/adapter.js +178 -0
  64. package/out/src/adapters/wof-postalcode-json/adapter.js.map +1 -0
  65. package/out/src/align.d.ts +58 -0
  66. package/out/src/align.d.ts.map +1 -0
  67. package/out/src/align.js +139 -0
  68. package/out/src/align.js.map +1 -0
  69. package/out/src/build.d.ts +104 -0
  70. package/out/src/build.d.ts.map +1 -0
  71. package/out/src/build.js +201 -0
  72. package/out/src/build.js.map +1 -0
  73. package/out/src/codex/us-fips-state.d.ts +44 -0
  74. package/out/src/codex/us-fips-state.d.ts.map +1 -0
  75. package/out/src/codex/us-fips-state.js +105 -0
  76. package/out/src/codex/us-fips-state.js.map +1 -0
  77. package/out/src/codex/us-street-suffix.d.ts +259 -0
  78. package/out/src/codex/us-street-suffix.d.ts.map +1 -0
  79. package/out/src/codex/us-street-suffix.js +285 -0
  80. package/out/src/codex/us-street-suffix.js.map +1 -0
  81. package/out/src/format.d.ts +79 -0
  82. package/out/src/format.d.ts.map +1 -0
  83. package/out/src/format.js +151 -0
  84. package/out/src/format.js.map +1 -0
  85. package/out/src/golden.d.ts +50 -0
  86. package/out/src/golden.d.ts.map +1 -0
  87. package/out/src/golden.js +104 -0
  88. package/out/src/golden.js.map +1 -0
  89. package/out/src/index.d.ts +18 -0
  90. package/out/src/index.d.ts.map +1 -0
  91. package/out/src/index.js +18 -0
  92. package/out/src/index.js.map +1 -0
  93. package/out/src/parquet-wrapper/index.d.ts +12 -0
  94. package/out/src/parquet-wrapper/index.d.ts.map +1 -0
  95. package/out/src/parquet-wrapper/index.js +12 -0
  96. package/out/src/parquet-wrapper/index.js.map +1 -0
  97. package/out/src/parquet-wrapper/reader.d.ts +31 -0
  98. package/out/src/parquet-wrapper/reader.d.ts.map +1 -0
  99. package/out/src/parquet-wrapper/reader.js +54 -0
  100. package/out/src/parquet-wrapper/reader.js.map +1 -0
  101. package/out/src/parquet-wrapper/schema.d.ts +45 -0
  102. package/out/src/parquet-wrapper/schema.d.ts.map +1 -0
  103. package/out/src/parquet-wrapper/schema.js +55 -0
  104. package/out/src/parquet-wrapper/schema.js.map +1 -0
  105. package/out/src/parquet-wrapper/writer.d.ts +41 -0
  106. package/out/src/parquet-wrapper/writer.d.ts.map +1 -0
  107. package/out/src/parquet-wrapper/writer.js +71 -0
  108. package/out/src/parquet-wrapper/writer.js.map +1 -0
  109. package/out/src/parquet.d.ts +122 -0
  110. package/out/src/parquet.d.ts.map +1 -0
  111. package/out/src/parquet.js +220 -0
  112. package/out/src/parquet.js.map +1 -0
  113. package/out/src/runner.d.ts +100 -0
  114. package/out/src/runner.d.ts.map +1 -0
  115. package/out/src/runner.js +183 -0
  116. package/out/src/runner.js.map +1 -0
  117. package/out/src/split.d.ts +108 -0
  118. package/out/src/split.d.ts.map +1 -0
  119. package/out/src/split.js +191 -0
  120. package/out/src/split.js.map +1 -0
  121. package/out/src/synthesize.d.ts +146 -0
  122. package/out/src/synthesize.d.ts.map +1 -0
  123. package/out/src/synthesize.js +472 -0
  124. package/out/src/synthesize.js.map +1 -0
  125. package/out/src/tokenize.d.ts +47 -0
  126. package/out/src/tokenize.d.ts.map +1 -0
  127. package/out/src/tokenize.js +49 -0
  128. package/out/src/tokenize.js.map +1 -0
  129. package/out/src/types.d.ts +168 -0
  130. package/out/src/types.d.ts.map +1 -0
  131. package/out/src/types.js +19 -0
  132. package/out/src/types.js.map +1 -0
  133. package/out/src/wof-json.d.ts +105 -0
  134. package/out/src/wof-json.d.ts.map +1 -0
  135. package/out/src/wof-json.js +174 -0
  136. package/out/src/wof-json.js.map +1 -0
  137. package/package.json +36 -0
@@ -0,0 +1,241 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `wof-admin`: Who's On First admin GeoJSON-bundle adapter.
7
+ *
8
+ * **Phase 1.5.1 pivot.** The original Phase 1.5 SQLite adapter (formerly at
9
+ * `packages/corpus/src/adapters/wof-admin/`, removed in this same change) was replaced by this
10
+ * one because the SQLite distribution path was unworkable for the real corpus build:
11
+ *
12
+ * 1. `dist.whosonfirst.org/sqlite/` is dead (NXDOMAIN); the Geocode-Earth mirror is the only one.
13
+ * 2. The Geocode-Earth-hosted postalcode DB tags every row `mz:is_current = -1` ("unknown but treated
14
+ * as active"); the SQLite adapter's `is_current = 1` predicate emitted zero rows.
15
+ * 3. The `names` table in the SQLite distribution is empty — localized `name:*` variants live in a
16
+ * separate distribution. The St. Petersburg / Mt. Vernon / Ft. Lauderdale alternation cases
17
+ * (the original Phase 1.5.1 motivator) cannot be solved on the SQLite path even with a
18
+ * patched `is_current` predicate.
19
+ *
20
+ * Input: a directory containing one or more cloned `whosonfirst-data-admin-<cc>` GitHub repos. Each
21
+ * repo has `data/XXX/YYY/ZZZ/<wof-id>.geojson` files; `**\/*.geojson` walks the tree recursively.
22
+ * Alternate-geometry siblings (`-alt-*`) are skipped — they're separate exports of the same
23
+ * record, not new records.
24
+ *
25
+ * Per record, the adapter emits one row per `(name-variant, hierarchy-variant)` pair:
26
+ *
27
+ * - **Name variants**: the canonical `wof:name` (slot key `default`) plus every `name:*` localized
28
+ * variant present on the feature (`name:eng_x_preferred`, `name:eng_x_colloquial`,
29
+ * `name:rus_x_preferred`, ...). This is the Phase 1.5.1 fix for the St. Petersburg case:
30
+ * `"Saint Petersburg"` (canonical) and `"St. Petersburg"` (eng_x_colloquial) both become
31
+ * training rows for the same WOF id.
32
+ * - **Hierarchy variants** (unchanged from the SQLite adapter): locality → 3 variants, region → 2,
33
+ * country → 1, county → 1.
34
+ *
35
+ * `source_id` is `wof-admin-<wof_id>-<name-slot>-<hierarchy-variant>`. The previous SQLite adapter
36
+ * used `wof-admin-<wof_id>-<hierarchy-variant>` (no name slot); the new format adds a name-slot
37
+ * segment so the colloquial / preferred / per-locale variants survive dedup independently.
38
+ *
39
+ * License: CC0. The adapter stamps every row with `CC0-1.0`.
40
+ */
41
+ import { formatAddress, reconcileComponents } from "../../format.js";
42
+ import { buildAncestryIndex, normalizeNameKey, walkFeatures } from "../../wof-json.js";
/**
 * Country surface forms, indexed by ISO 3166-1 alpha-2 code.
 *
 * Each value has to be the **OpenCage-canonical** spelling. The `address-formatter` library
 * rewrites some country names while rendering (for example `"United States"` becomes
 * `"United States of America"`), and downstream alignment fails when `components.country` and
 * the rendered `raw` string differ. Storing the canonical spelling here keeps both sides equal.
 *
 * Only US and FR are listed (Phase 1). Callers fall back to other sources for codes that are
 * not in this table, accepting the alignment risk for names that were never canonicalized.
 */
const COUNTRY_DISPLAY_NAME = {
    US: "United States of America",
    FR: "France",
};
/**
 * Default BCP-47 locale for a corpus row's `locale` field, indexed by country code.
 * Codes that are not listed here have no default.
 */
const LOCALE_BY_COUNTRY = {
    US: "en-US",
    FR: "fr-FR",
};
/**
 * WOF placetype -> Mailwoman `ComponentTag`. A `Map` is used rather than a plain object so that
 * inherited property names (`"constructor"`, `"toString"`, ...) can never be mistaken for a hit.
 */
const COMPONENT_TAG_BY_PLACETYPE = new Map([
    ["country", "country"],
    ["nation", "country"],
    ["macroregion", "region"],
    ["region", "region"],
    ["macrocounty", "subregion"],
    ["county", "subregion"],
    ["localadmin", "subregion"],
    ["locality", "locality"],
    ["borough", "dependent_locality"],
    ["macrohood", "dependent_locality"],
    ["neighbourhood", "dependent_locality"],
    ["microhood", "dependent_locality"],
]);
/**
 * Translate a WOF placetype into the Mailwoman `ComponentTag` it maps to.
 *
 * @param placetype The record's WOF placetype string.
 * @returns The matching tag, or `undefined` when the placetype is not mapped (callers treat
 *   that as "skip this record").
 */
function placetypeToTag(placetype) {
    return COMPONENT_TAG_BY_PLACETYPE.get(placetype);
}
/**
 * Expand one record into its hierarchy variants for a given surface form of its own name.
 *
 * The region and country ancestors are the first entries of `ancestry` whose placetype maps to
 * the `region` / `country` tag. The country text is `COUNTRY_DISPLAY_NAME[row.country]` when
 * present, otherwise the country ancestor's `name`, otherwise the raw `row.country` code.
 *
 * Variants by tag of the record itself:
 *
 * - `locality` / `dependent_locality`: `self`; `with-region` when a region ancestor exists;
 *   then `with-region-country` (region and country both found) or `with-country` (country only).
 * - `region`: `self`, plus `with-country` when a country ancestor exists.
 * - `country` / `subregion`: `self` only.
 * - anything else (including unmapped placetypes): no variants.
 *
 * @param row The record being expanded; `placetype` and `country` are read.
 * @param ancestry The record's ancestor chain; each entry's `placetype` and `name` are read.
 * @param selfName Text to place in the record's own component. Callers pass the canonical name
 *   for the `"default"` slot and a localized `name:*` value for the other slots.
 * @returns Array of `{ suffix, components }`; `suffix` names the hierarchy variant.
 */
export function variantsFor(row, ancestry, selfName) {
    const selfTag = placetypeToTag(row.placetype);
    if (!selfTag) {
        return [];
    }
    const firstAncestorTagged = (tag) => ancestry.find((ancestor) => placetypeToTag(ancestor.placetype) === tag);
    const region = firstAncestorTagged("region");
    const country = firstAncestorTagged("country");
    const countryDisplay = COUNTRY_DISPLAY_NAME[row.country] ?? country?.name ?? row.country;
    const selfOnly = { suffix: "self", components: { [selfTag]: selfName } };
    // Countries and subregions are only ever emitted on their own.
    if (selfTag === "country" || selfTag === "subregion") {
        return [selfOnly];
    }
    if (selfTag === "region") {
        if (!country) {
            return [selfOnly];
        }
        return [
            selfOnly,
            { suffix: "with-country", components: { region: selfName, country: countryDisplay } },
        ];
    }
    if (selfTag !== "locality" && selfTag !== "dependent_locality") {
        return [];
    }
    // Locality-like records: grow the address outward one ancestor level at a time.
    const variants = [selfOnly];
    if (region) {
        variants.push({
            suffix: "with-region",
            components: { [selfTag]: selfName, region: region.name },
        });
    }
    if (country) {
        variants.push(region
            ? {
                suffix: "with-region-country",
                components: { [selfTag]: selfName, region: region.name, country: countryDisplay },
            }
            : {
                suffix: "with-country",
                components: { [selfTag]: selfName, country: countryDisplay },
            });
    }
    return variants;
}
/**
 * Build the list of name slots for one record.
 *
 * The first slot always has key `"default"`. Its value is `rec.name`, except for records whose
 * placetype maps to `country`: those use `COUNTRY_DISPLAY_NAME[rec.country]` when the table has
 * an entry, so the default row carries the OpenCage-canonical country spelling.
 *
 * The remaining slots come from `rec.nameVariants`, in iteration order. A variant is dropped
 * when its value equals the default value or the value of an earlier kept variant, so no two
 * slots carry the same text.
 *
 * Slot keys are guaranteed unique within the returned list. The adapter embeds the slot key in
 * `source_id`, so two slots sharing a key would produce colliding ids. When a normalized key
 * has already been used (including `"default"`), a numeric suffix is appended: `key-2`,
 * `key-3`, and so on. Output is unchanged whenever the normalized keys are already unique.
 *
 * NOTE(review): the shape of `rec.nameVariants` is not visible from this file. If it is a
 * `Map` and `normalizeNameKey` is injective, the suffixing never triggers — confirm.
 *
 * @param rec Record to expand; reads `placetype`, `country`, `name`, and `nameVariants`
 *   (an iterable of `[rawKey, value]` pairs).
 * @returns Array of `{ key, value }` slots, default slot first.
 */
export function nameSlotsFor(rec) {
    const selfTag = placetypeToTag(rec.placetype);
    const canonicalSelfName = selfTag === "country" ? (COUNTRY_DISPLAY_NAME[rec.country] ?? rec.name) : rec.name;
    const seenValues = new Set([canonicalSelfName]);
    const usedKeys = new Set(["default"]);
    const slots = [{ key: "default", value: canonicalSelfName }];
    for (const [rawKey, value] of rec.nameVariants) {
        if (seenValues.has(value))
            continue;
        seenValues.add(value);
        const baseKey = normalizeNameKey(rawKey);
        // Disambiguate a repeated slot key so every slot maps to a distinct `source_id`.
        let key = baseKey;
        for (let ordinal = 2; usedKeys.has(key); ordinal++) {
            key = `${baseKey}-${ordinal}`;
        }
        usedKeys.add(key);
        slots.push({ key, value });
    }
    return slots;
}
/** Adapter id; also stamped into each row's `source` and used as the `source_id` prefix. */
export const WOF_ADMIN_ADAPTER_ID = "wof-admin";
/**
 * Build the wof-admin JSON-bundle adapter.
 *
 * The returned object keeps no state between runs. Rows are produced in ascending record-id
 * order rather than filesystem-walk order, so two runs over the same input directory yield the
 * same sequence.
 *
 * `rows(opts)` reads these options: `inputPath` (directory handed to `walkFeatures`), `signal`
 * (optional abort signal, checked once per walked feature and once per emitted record),
 * `country` (optional exact-match filter on the record's country code) and `limit` (optional
 * cap on the number of rows yielded).
 */
export function createWofAdminAdapter() {
    return {
        id: WOF_ADMIN_ADAPTER_ID,
        defaultLicense: "CC0-1.0",
        description: "Who's On First admin GeoJSON bundles (countries, regions, counties, localities) — multi-name variants per record.",
        async *rows(opts) {
            // Pass 1: one walk over the GeoJSON tree, keeping only records that can become rows.
            // Records with an unmapped placetype are left out so the ancestry index stays small;
            // a country filter, when given, is applied here as well.
            const records = new Map();
            for await (const feature of walkFeatures(opts.inputPath, { signal: opts.signal })) {
                if (opts.signal?.aborted) {
                    return;
                }
                const countryMatches = !opts.country || feature.country === opts.country;
                if (countryMatches && placetypeToTag(feature.placetype)) {
                    records.set(feature.id, feature);
                }
            }
            const ancestry = buildAncestryIndex(records);
            // Pass 2: walk ids in ascending numeric order so the output is deterministic.
            // NOTE(review): the subtraction comparator assumes record ids are numbers — confirm.
            const orderedIds = Array.from(records.keys()).sort((left, right) => left - right);
            let yielded = 0;
            for (const id of orderedIds) {
                if (opts.signal?.aborted) {
                    return;
                }
                const record = records.get(id);
                const chain = ancestry.get(id) ?? [];
                for (const { key: slotKey, value: slotName } of nameSlotsFor(record)) {
                    for (const { suffix, components } of variantsFor(record, chain, slotName)) {
                        // The cap is checked before doing any formatting work for the next row.
                        if (opts.limit !== undefined && yielded >= opts.limit) {
                            return;
                        }
                        const raw = formatAddress(components, record.country, { separator: ", " });
                        if (!raw) {
                            continue;
                        }
                        // Drop the row when no component survives reconciliation with `raw`.
                        const aligned = reconcileComponents(components, raw);
                        if (Object.keys(aligned).length === 0) {
                            continue;
                        }
                        yield {
                            raw,
                            components: aligned,
                            country: record.country,
                            // NOTE(review): `undefined` for any country missing from
                            // LOCALE_BY_COUNTRY (only US and FR are listed) — confirm that
                            // downstream consumers accept a row without a locale.
                            locale: LOCALE_BY_COUNTRY[record.country],
                            source: WOF_ADMIN_ADAPTER_ID,
                            source_id: `${WOF_ADMIN_ADAPTER_ID}-${record.id}-${slotKey}-${suffix}`,
                            corpus_version: "",
                            license: "CC0-1.0",
                        };
                        yielded += 1;
                    }
                }
            }
        },
    };
}
/** Shared adapter instance, suitable for `defaultAdapterRegistry`. */
export const wofAdminAdapter = createWofAdminAdapter();
241
+ //# sourceMappingURL=adapter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../../../src/adapters/wof-admin-json/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAuCG;AAIH,OAAO,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAA;AAEpE,OAAO,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,YAAY,EAAkB,MAAM,mBAAmB,CAAA;AAEtG;;;;;;;;;;GAUG;AACH,MAAM,oBAAoB,GAA2B;IACpD,EAAE,EAAE,0BAA0B;IAC9B,EAAE,EAAE,QAAQ;CACZ,CAAA;AAED,0FAA0F;AAC1F,MAAM,iBAAiB,GAA2B;IACjD,EAAE,EAAE,OAAO;IACX,EAAE,EAAE,OAAO;CACX,CAAA;AAED,iFAAiF;AACjF,SAAS,cAAc,CAAC,SAAwC;IAC/D,QAAQ,SAAS,EAAE,CAAC;QACnB,KAAK,SAAS,CAAC;QACf,KAAK,QAAQ;YACZ,OAAO,SAAS,CAAA;QACjB,KAAK,aAAa,CAAC;QACnB,KAAK,QAAQ;YACZ,OAAO,QAAQ,CAAA;QAChB,KAAK,aAAa,CAAC;QACnB,KAAK,QAAQ,CAAC;QACd,KAAK,YAAY;YAChB,OAAO,WAAW,CAAA;QACnB,KAAK,UAAU;YACd,OAAO,UAAU,CAAA;QAClB,KAAK,SAAS,CAAC;QACf,KAAK,WAAW,CAAC;QACjB,KAAK,eAAe,CAAC;QACrB,KAAK,WAAW;YACf,OAAO,oBAAoB,CAAA;QAC5B;YACC,OAAO,SAAS,CAAA;IAClB,CAAC;AACF,CAAC;AAUD;;;;;;;;;;;GAWG;AACH,MAAM,UAAU,WAAW,CAAC,GAAc,EAAE,QAAqB,EAAE,QAAgB;IAClF,MAAM,OAAO,GAAG,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,CAAA;IAC7C,IAAI,CAAC,OAAO;QAAE,OAAO,EAAE,CAAA;IAEvB,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,QAAQ,CAAC,CAAA;IAC7E,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,SAAS,CAAC,CAAA;IAC/E,MAAM,cAAc,GAAG,oBAAoB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,OAAO,EAAE,IAAI,IAAI,GAAG,CAAC,OAAO,CAAA;IAExF,MAAM,QAAQ,GAAkB,EAAE,CAAA;IAElC,QAAQ,OAAO,EAAE,CAAC;QACjB,KAAK,UAAU,CAAC;QAChB,KAAK,oBAAoB,CAAC,CAAC,CAAC;YAC3B,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,EAAE,CAAC,OAAO,CAAC,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAA;YACtE,IAAI,MAAM,EAAE,CAAC;gBACZ,QAAQ,CAAC,IAAI,CAAC;oBACb,MAAM,EAAE,aAAa;oBACrB,UAAU,EAAE,EAAE,CAAC,OAAO,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE;iBACxD,CAAC,CAAA;YACH,CAAC;YACD,IAAI,MAAM,IAAI,OAAO,EAAE,CAAC;gBACvB,QAAQ,CAAC,IAAI,CAAC;oBACb,MAAM,EAAE,qBAAqB;oBAC7B,UAAU,EAAE,EAAE,CAAC,OAAO,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC,IAAI,E
AAE,OAAO,EAAE,cAAc,EAAE;iBACjF,CAAC,CAAA;YACH,CAAC;iBAAM,IAAI,CAAC,MAAM,IAAI,OAAO,EAAE,CAAC;gBAC/B,QAAQ,CAAC,IAAI,CAAC;oBACb,MAAM,EAAE,cAAc;oBACtB,UAAU,EAAE,EAAE,CAAC,OAAO,CAAC,EAAE,QAAQ,EAAE,OAAO,EAAE,cAAc,EAAE;iBAC5D,CAAC,CAAA;YACH,CAAC;YACD,OAAO,QAAQ,CAAA;QAChB,CAAC;QAED,KAAK,QAAQ,CAAC,CAAC,CAAC;YACf,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAA;YACnE,IAAI,OAAO,EAAE,CAAC;gBACb,QAAQ,CAAC,IAAI,CAAC;oBACb,MAAM,EAAE,cAAc;oBACtB,UAAU,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,cAAc,EAAE;iBACzD,CAAC,CAAA;YACH,CAAC;YACD,OAAO,QAAQ,CAAA;QAChB,CAAC;QAED,KAAK,SAAS,CAAC,CAAC,CAAC;YAChB,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,EAAE,OAAO,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAA;YACpE,OAAO,QAAQ,CAAA;QAChB,CAAC;QAED,KAAK,WAAW,CAAC,CAAC,CAAC;YAClB,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,EAAE,SAAS,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAA;YACtE,OAAO,QAAQ,CAAA;QAChB,CAAC;QAED;YACC,OAAO,EAAE,CAAA;IACX,CAAC;AACF,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,YAAY,CAAC,GAAc;IAC1C,MAAM,OAAO,GAAG,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,CAAA;IAC7C,MAAM,iBAAiB,GAAG,OAAO,KAAK,SAAS,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAA;IAE5G,MAAM,IAAI,GAAG,IAAI,GAAG,CAAS,CAAC,iBAAiB,CAAC,CAAC,CAAA;IACjD,MAAM,KAAK,GAA0C,CAAC,EAAE,GAAG,EAAE,SAAS,EAAE,KAAK,EAAE,iBAAiB,EAAE,CAAC,CAAA;IAEnG,KAAK,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,GAAG,CAAC,YAAY,EAAE,CAAC;QAChD,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC;YAAE,SAAQ;QAC7B,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,CAAA;QACf,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,gBAAgB,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,CAAC,CAAA;IACrD,CAAC;IAED,OAAO,KAAK,CAAA;AACb,CAAC;AAED,MAAM,CAAC,MAAM,oBAAoB,GAAG,WAAW,CAAA;AAE/C;;;;GAIG;AACH,MAAM,UAAU,qBAAqB;IACpC,OAAO;QACN,EAAE,EAAE,oBAAoB;QACxB,cAAc,EAAE,SAAS;QACzB,WAAW,EACV,mHAAmH;QAEpH,KAAK,CAAC,CAAC,IAAI,CAAC,IAAoB;YAC/B,0EAA0E;YAC1E,sFAAsF;YACtF,wFAAwF;YACxF,qFAAqF;YACrF,wFAAwF;YACxF,MAAM,IAAI,GAAG,IAAI,GAAG,EAAqB,CAAA;YACzC,IAAI,KAAK,EAAE,MAAM,GAAG,IAAI,YAAY,CAAC,I
AAI,CAAC,SAAS,EAAE,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,CAAC;gBAC/E,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;oBAAE,OAAM;gBAChC,IAAI,IAAI,CAAC,OAAO,IAAI,GAAG,CAAC,OAAO,KAAK,IAAI,CAAC,OAAO;oBAAE,SAAQ;gBAC1D,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC;oBAAE,SAAQ;gBAC5C,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAA;YACtB,CAAC;YAED,MAAM,QAAQ,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;YAEzC,gEAAgE;YAChE,MAAM,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;YAClD,IAAI,OAAO,GAAG,CAAC,CAAA;YAEf,KAAK,MAAM,EAAE,IAAI,GAAG,EAAE,CAAC;gBACtB,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;oBAAE,OAAM;gBAChC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAE,CAAA;gBACzB,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,CAAA;gBACpC,MAAM,KAAK,GAAG,YAAY,CAAC,GAAG,CAAC,CAAA;gBAE/B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;oBAC1B,MAAM,QAAQ,GAAG,WAAW,CAAC,GAAG,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAA;oBACpD,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;wBAChC,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,OAAO,IAAI,IAAI,CAAC,KAAK;4BAAE,OAAM;wBAE7D,MAAM,GAAG,GAAG,aAAa,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;wBAC/E,IAAI,CAAC,GAAG;4BAAE,SAAQ;wBAClB,MAAM,OAAO,GAAG,mBAAmB,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;wBAC5D,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,KAAK,CAAC;4BAAE,SAAQ;wBAE/C,MAAM;4BACL,GAAG;4BACH,UAAU,EAAE,OAAO;4BACnB,OAAO,EAAE,GAAG,CAAC,OAAO;4BACpB,MAAM,EAAE,iBAAiB,CAAC,GAAG,CAAC,OAAO,CAAC;4BACtC,MAAM,EAAE,oBAAoB;4BAC5B,SAAS,EAAE,GAAG,oBAAoB,IAAI,GAAG,CAAC,EAAE,IAAI,IAAI,CAAC,GAAG,IAAI,OAAO,CAAC,MAAM,EAAE;4BAC5E,cAAc,EAAE,EAAE;4BAClB,OAAO,EAAE,SAAS;yBAClB,CAAA;wBACD,OAAO,EAAE,CAAA;oBACV,CAAC;gBACF,CAAC;YACF,CAAC;QACF,CAAC;KACD,CAAA;AACF,CAAC;AAED,qEAAqE;AACrE,MAAM,CAAC,MAAM,eAAe,GAAG,qBAAqB,EAAE,CAAA"}
@@ -0,0 +1,63 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `wof-postalcode`: Who's On First postalcode GeoJSON-bundle adapter.
7
+ *
8
+ * **Phase 1.5.1 pivot.** Replaces the previous SpatiaLite-backed implementation (formerly at
9
+ * `packages/corpus/src/adapters/wof-postalcode/`, removed in this same change). The rationale is
10
+ * in `wof-admin-json/adapter.ts` and in `DECISIONS.md` — short version: the SQLite distribution
11
+ * mirror is dead, the live distro tags every postcode row `mz:is_current = -1` which the old
12
+ * `is_current = 1` predicate excluded, and localized `name:*` variants don't ship in the SQLite
13
+ * export at all.
14
+ *
15
+ * Input: a directory containing one or more cloned `whosonfirst-data-postalcode-<cc>` repos plus
16
+ * the relevant `whosonfirst-data-admin-<cc>` repos (postcode records reference admin ancestry by
17
+ * `wof:parent_id`, so the locality / region / country records must be in the same walk for the
18
+ * ancestry chain to resolve). The corpus pipeline clones all four repos under
19
+ * `/data/corpus/sources/wof/repos/` and points the adapter at that root.
20
+ *
21
+ * Per live postalcode record, the adapter emits one row per `(name-variant, hierarchy-variant)`
22
+ * pair:
23
+ *
24
+ * - **Name variants**: canonical `wof:name` (slot key `default`, typically the postcode digits
25
+ * themselves) plus any `name:*` variants on the postcode feature. In practice WOF postcode
26
+ * records rarely carry localized name variants, so this expansion is usually a no-op — but
27
+ * the code path stays symmetric with the admin adapter for consistency.
28
+ * - **Hierarchy variants** (unchanged from the SQLite adapter): self, +locality, +locality+region,
29
+ * +locality+region+country.
30
+ *
31
+ * `source_id` is `wof-postalcode-<wof_id>-<name-slot>-<hierarchy-variant>`. Ancestor names always
32
+ * come from the ancestor's canonical `wof:name`; this adapter does NOT iterate ancestor name
33
+ * variants (e.g. it does not emit `"75008 Париж"` even when Paris has a `name:rus_x_preferred`).
34
+ * That cross-product belongs to a future synthesis pass; emitting it here would multiply row
35
+ * counts ~10× without a clear training-value story.
36
+ *
37
+ * License: CC0.
38
+ */
39
+ import type { ComponentTag } from "@mailwoman/core/types";
40
+ import type { CorpusAdapter } from "../../types.js";
41
+ import { type WofRecord } from "../../wof-json.js";
/** One hierarchy variant of a record: a label plus the address components to render. */
interface VariantSpec {
    /** Variant label. Presumably the `<hierarchy-variant>` segment of `source_id` — confirm. */
    suffix: string;
    /** Surface text per component tag; tags that are not part of this variant are absent. */
    components: Partial<Record<ComponentTag, string>>;
}
/**
 * Produce the hierarchy variants for a single postcode record.
 *
 * @param row The postcode record.
 * @param ancestry The record's ancestor records.
 * @param selfName Postcode surface form: the canonical `wof:name` for the `default` slot, or a
 *   localized `name:*` value for any other slot.
 * @returns One `VariantSpec` per hierarchy variant.
 */
export declare function postcodeVariantsFor(row: WofRecord, ancestry: WofRecord[], selfName: string): VariantSpec[];
/**
 * List the name slots of a postcode record.
 *
 * The `default` slot carries `wof:name` verbatim (the postcode digits). Further slots come from
 * the record's `name:*` variants, with values equal to the default removed.
 *
 * @param rec The postcode record.
 * @returns `{ key, value }` pairs: `key` is the slot name, `value` the surface form.
 */
export declare function nameSlotsFor(rec: WofRecord): {
    key: string;
    value: string;
}[];
/** Adapter id for the postalcode adapter; it is the leading segment of every `source_id`. */
export declare const WOF_POSTALCODE_ADAPTER_ID = "wof-postalcode";
/** Construct a `wof-postalcode` GeoJSON-bundle adapter. */
export declare function createWofPostalcodeAdapter(): CorpusAdapter;
/** Ready-made adapter instance. Presumably built by `createWofPostalcodeAdapter()` — confirm. */
export declare const wofPostalcodeAdapter: CorpusAdapter;
export {};
63
+ //# sourceMappingURL=adapter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../../../src/adapters/wof-postalcode-json/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AAGH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAA;AAEzD,OAAO,KAAK,EAAgC,aAAa,EAAE,MAAM,gBAAgB,CAAA;AACjF,OAAO,EAAsD,KAAK,SAAS,EAAE,MAAM,mBAAmB,CAAA;AA6BtG,UAAU,WAAW;IACpB,MAAM,EAAE,MAAM,CAAA;IACd,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAA;CACjD;AAED;;;GAGG;AACH,wBAAgB,mBAAmB,CAAC,GAAG,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,EAAE,QAAQ,EAAE,MAAM,GAAG,WAAW,EAAE,CAmC1G;AAED;;;GAGG;AACH,wBAAgB,YAAY,CAAC,GAAG,EAAE,SAAS,GAAG,KAAK,CAAC;IAAE,GAAG,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,CASlF;AAED,eAAO,MAAM,yBAAyB,mBAAmB,CAAA;AAEzD,wBAAgB,0BAA0B,IAAI,aAAa,CA2D1D;AAED,eAAO,MAAM,oBAAoB,eAA+B,CAAA"}
@@ -0,0 +1,178 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `wof-postalcode`: Who's On First postalcode GeoJSON-bundle adapter.
7
+ *
8
+ * **Phase 1.5.1 pivot.** Replaces the previous SpatiaLite-backed implementation (formerly at
9
+ * `packages/corpus/src/adapters/wof-postalcode/`, removed in this same change). The rationale is
10
+ * in `wof-admin-json/adapter.ts` and in `DECISIONS.md` — short version: the SQLite distribution
11
+ * mirror is dead, the live distro tags every postcode row `mz:is_current = -1` which the old
12
+ * `is_current = 1` predicate excluded, and localized `name:*` variants don't ship in the SQLite
13
+ * export at all.
14
+ *
15
+ * Input: a directory containing one or more cloned `whosonfirst-data-postalcode-<cc>` repos plus
16
+ * the relevant `whosonfirst-data-admin-<cc>` repos (postcode records reference admin ancestry by
17
+ * `wof:parent_id`, so the locality / region / country records must be in the same walk for the
18
+ * ancestry chain to resolve). The corpus pipeline clones all four repos under
19
+ * `/data/corpus/sources/wof/repos/` and points the adapter at that root.
20
+ *
21
+ * Per live postalcode record, the adapter emits one row per `(name-variant, hierarchy-variant)`
22
+ * pair:
23
+ *
24
+ * - **Name variants**: canonical `wof:name` (slot key `default`, typically the postcode digits
25
+ * themselves) plus any `name:*` variants on the postcode feature. In practice WOF postcode
26
+ * records rarely carry localized name variants, so this expansion is usually a no-op — but
27
+ * the code path stays symmetric with the admin adapter for consistency.
28
+ * - **Hierarchy variants** (unchanged from the SQLite adapter): self, +locality, +locality+region,
29
+ * +locality+region+country.
30
+ *
31
+ * `source_id` is `wof-postalcode-<wof_id>-<name-slot>-<hierarchy-variant>`. Ancestor names always
32
+ * come from the ancestor's canonical `wof:name`; this adapter does NOT iterate ancestor name
33
+ * variants (e.g. it does not emit `"75008 Париж"` even when Paris has a `name:rus_x_preferred`).
34
+ * That cross-product belongs to a future synthesis pass; emitting it here would multiply row
35
+ * counts ~10× without a clear training-value story.
36
+ *
37
+ * License: CC0.
38
+ */
39
+ import { formatAddress, reconcileComponents } from "../../format.js";
40
+ import { buildAncestryIndex, normalizeNameKey, walkFeatures } from "../../wof-json.js";
41
+ const COUNTRY_DISPLAY_NAME = { // Country code → display name for the `country` component; unlisted codes fall back to the ancestor country's name, then to the raw code.
42
+ US: "United States of America",
43
+ FR: "France",
44
+ };
45
+ const LOCALE_BY_COUNTRY = { // Country code → `locale` stamped on emitted rows; unlisted codes yield `undefined`.
46
+ US: "en-US",
47
+ FR: "fr-FR",
48
+ };
49
/**
 * Map a Who's On First placetype onto the component tag this adapter works with.
 *
 * `country`/`nation` collapse to `"country"`, `macroregion`/`region` to `"region"`,
 * `locality` stays `"locality"`, and `postalcode` becomes `"postcode"`. Any other
 * placetype yields `undefined`, which callers treat as "not relevant to this adapter".
 */
function placetypeToTag(placetype) {
	if (placetype === "country" || placetype === "nation") {
		return "country";
	}
	if (placetype === "macroregion" || placetype === "region") {
		return "region";
	}
	if (placetype === "locality") {
		return "locality";
	}
	if (placetype === "postalcode") {
		return "postcode";
	}
	return undefined;
}
65
/**
 * Produce the hierarchy variants emitted for one postcode record.
 *
 * `selfName` is the postcode surface form to use (the canonical `wof:name` for the `default`
 * slot, or a localized `name:*` value). Records that are not postalcodes produce no variants.
 * Otherwise the list starts with `self` and grows by one entry for each further ancestor level
 * that resolves, in the order locality → region → country; the chain stops at the first
 * missing level.
 */
export function postcodeVariantsFor(row, ancestry, selfName) {
	if (placetypeToTag(row.placetype) !== "postcode") {
		return [];
	}

	// First ancestor in the chain whose placetype maps to the requested tag.
	const firstAncestorTagged = (tag) => ancestry.find((ancestor) => placetypeToTag(ancestor.placetype) === tag);
	const locality = firstAncestorTagged("locality");
	const region = firstAncestorTagged("region");
	const country = firstAncestorTagged("country");

	// Prefer the curated display name, then the ancestor's own name, then the raw country code.
	const countryDisplay = COUNTRY_DISPLAY_NAME[row.country] ?? country?.name ?? row.country;

	const variants = [{ suffix: "self", components: { postcode: selfName } }];

	if (!locality) {
		return variants;
	}
	const withLocality = { postcode: selfName, locality: locality.name };
	variants.push({ suffix: "with-locality", components: withLocality });

	if (!region) {
		return variants;
	}
	const withRegion = { ...withLocality, region: region.name };
	variants.push({ suffix: "with-locality-region", components: withRegion });

	if (!country) {
		return variants;
	}
	variants.push({
		suffix: "with-locality-region-country",
		components: { ...withRegion, country: countryDisplay },
	});

	return variants;
}
102
/**
 * List the name slots for a record.
 *
 * The first slot is always `default`, carrying `rec.name` verbatim (for postcodes, the digits).
 * Each entry of `rec.nameVariants` then contributes a slot keyed by its normalized variant key,
 * unless its value has already been emitted by the default slot or an earlier variant.
 */
export function nameSlotsFor(rec) {
	const canonical = rec.name;
	const emittedValues = new Set([canonical]);
	const result = [{ key: "default", value: canonical }];

	for (const entry of rec.nameVariants) {
		const [variantKey, surface] = entry;
		if (!emittedValues.has(surface)) {
			emittedValues.add(surface);
			result.push({ key: normalizeNameKey(variantKey), value: surface });
		}
	}

	return result;
}
117
+ export const WOF_POSTALCODE_ADAPTER_ID = "wof-postalcode"; // Adapter id; also used as each emitted row's `source` and as the prefix of its `source_id`.
118
/**
 * Build the `wof-postalcode` corpus adapter.
 *
 * The returned object exposes the adapter id, its default license, a description, and an async
 * generator `rows(opts)`. `rows` walks `opts.inputPath`, optionally restricted to `opts.country`,
 * and yields one row per `(name slot, hierarchy variant)` pair of every postalcode record. It
 * stops quietly when `opts.signal` is aborted or once `opts.limit` rows have been yielded.
 */
export function createWofPostalcodeAdapter() {
	return {
		id: WOF_POSTALCODE_ADAPTER_ID,
		defaultLicense: "CC0-1.0",
		description: "Who's On First postalcode GeoJSON bundles (postcode → locality/region pairs). Ancestor names from sibling admin repos.",
		async *rows(opts) {
			const isAborted = () => Boolean(opts.signal?.aborted);

			// Pass 1: index every record whose placetype maps to a component tag. Admin records
			// (locality / region / country) are kept alongside postcodes so that postcode ancestry
			// can be resolved, even though only postcode records produce rows.
			const recordsById = new Map();
			for await (const feature of walkFeatures(opts.inputPath, { signal: opts.signal })) {
				if (isAborted()) {
					return;
				}
				const outsideCountry = opts.country && feature.country !== opts.country;
				if (outsideCountry || !placetypeToTag(feature.placetype)) {
					continue;
				}
				recordsById.set(feature.id, feature);
			}

			const ancestryById = buildAncestryIndex(recordsById);

			// Pass 2: visit ids in ascending order so output is deterministic.
			const orderedIds = Array.from(recordsById.keys());
			orderedIds.sort((left, right) => left - right);

			let emitted = 0;
			const limitReached = () => opts.limit !== undefined && emitted >= opts.limit;

			for (const id of orderedIds) {
				if (isAborted()) {
					return;
				}
				const record = recordsById.get(id);
				if (placetypeToTag(record.placetype) !== "postcode") {
					continue;
				}

				const chain = ancestryById.get(id) ?? [];
				for (const { key: slotKey, value: surface } of nameSlotsFor(record)) {
					for (const { suffix, components } of postcodeVariantsFor(record, chain, surface)) {
						if (limitReached()) {
							return;
						}

						const raw = formatAddress(components, record.country, { separator: ", " });
						if (!raw) {
							continue;
						}
						// Drop the variant when no components survive reconciliation against `raw`.
						const aligned = reconcileComponents(components, raw);
						if (Object.keys(aligned).length === 0) {
							continue;
						}

						yield {
							raw,
							components: aligned,
							country: record.country,
							locale: LOCALE_BY_COUNTRY[record.country],
							source: WOF_POSTALCODE_ADAPTER_ID,
							source_id: `${WOF_POSTALCODE_ADAPTER_ID}-${record.id}-${slotKey}-${suffix}`,
							corpus_version: "",
							license: "CC0-1.0",
						};
						emitted += 1;
					}
				}
			}
		},
	};
}
177
+ export const wofPostalcodeAdapter = createWofPostalcodeAdapter(); // Shared adapter instance, created once when the module is loaded.
178
+ //# sourceMappingURL=adapter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../../../src/adapters/wof-postalcode-json/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AAIH,OAAO,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAA;AAEpE,OAAO,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,YAAY,EAAkB,MAAM,mBAAmB,CAAA;AAEtG,MAAM,oBAAoB,GAA2B;IACpD,EAAE,EAAE,0BAA0B;IAC9B,EAAE,EAAE,QAAQ;CACZ,CAAA;AAED,MAAM,iBAAiB,GAA2B;IACjD,EAAE,EAAE,OAAO;IACX,EAAE,EAAE,OAAO;CACX,CAAA;AAED,SAAS,cAAc,CAAC,SAAwC;IAC/D,QAAQ,SAAS,EAAE,CAAC;QACnB,KAAK,SAAS,CAAC;QACf,KAAK,QAAQ;YACZ,OAAO,SAAS,CAAA;QACjB,KAAK,aAAa,CAAC;QACnB,KAAK,QAAQ;YACZ,OAAO,QAAQ,CAAA;QAChB,KAAK,UAAU;YACd,OAAO,UAAU,CAAA;QAClB,KAAK,YAAY;YAChB,OAAO,UAAU,CAAA;QAClB;YACC,OAAO,SAAS,CAAA;IAClB,CAAC;AACF,CAAC;AAOD;;;GAGG;AACH,MAAM,UAAU,mBAAmB,CAAC,GAAc,EAAE,QAAqB,EAAE,QAAgB;IAC1F,IAAI,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,UAAU;QAAE,OAAO,EAAE,CAAA;IAE3D,MAAM,QAAQ,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,UAAU,CAAC,CAAA;IACjF,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,QAAQ,CAAC,CAAA;IAC7E,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,SAAS,CAAC,CAAA;IAC/E,MAAM,cAAc,GAAG,oBAAoB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,OAAO,EAAE,IAAI,IAAI,GAAG,CAAC,OAAO,CAAA;IAExF,MAAM,QAAQ,GAAkB,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,EAAE,QAAQ,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAA;IAExF,IAAI,QAAQ,EAAE,CAAC;QACd,QAAQ,CAAC,IAAI,CAAC;YACb,MAAM,EAAE,eAAe;YACvB,UAAU,EAAE,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,CAAC,IAAI,EAAE;SAC3D,CAAC,CAAA;IACH,CAAC;IACD,IAAI,QAAQ,IAAI,MAAM,EAAE,CAAC;QACxB,QAAQ,CAAC,IAAI,CAAC;YACb,MAAM,EAAE,sBAAsB;YAC9B,UAAU,EAAE,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE;SAChF,CAAC,CAAA;IACH,CAAC;IACD,IAAI,QAAQ,IAAI,MAAM,IAAI,OAAO,EAAE,CAAC;QACnC,QAAQ,CAAC,IAAI,CAAC;YACb,MAAM,EAAE,8BAA8B;YACtC,UAAU,EAAE;gBACX,QAAQ,EAAE,QAAQ;gBAClB,QAAQ,EAAE,QAAQ,CAAC,IAAI;gBACvB,MAAM,EAAE,MAAM,CAA
C,IAAI;gBACnB,OAAO,EAAE,cAAc;aACvB;SACD,CAAC,CAAA;IACH,CAAC;IAED,OAAO,QAAQ,CAAA;AAChB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,YAAY,CAAC,GAAc;IAC1C,MAAM,IAAI,GAAG,IAAI,GAAG,CAAS,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAA;IACxC,MAAM,KAAK,GAA0C,CAAC,EAAE,GAAG,EAAE,SAAS,EAAE,KAAK,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC,CAAA;IAC1F,KAAK,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,GAAG,CAAC,YAAY,EAAE,CAAC;QAChD,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC;YAAE,SAAQ;QAC7B,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,CAAA;QACf,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,gBAAgB,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,CAAC,CAAA;IACrD,CAAC;IACD,OAAO,KAAK,CAAA;AACb,CAAC;AAED,MAAM,CAAC,MAAM,yBAAyB,GAAG,gBAAgB,CAAA;AAEzD,MAAM,UAAU,0BAA0B;IACzC,OAAO;QACN,EAAE,EAAE,yBAAyB;QAC7B,cAAc,EAAE,SAAS;QACzB,WAAW,EACV,wHAAwH;QAEzH,KAAK,CAAC,CAAC,IAAI,CAAC,IAAoB;YAC/B,uFAAuF;YACvF,sFAAsF;YACtF,sFAAsF;YACtF,MAAM,IAAI,GAAG,IAAI,GAAG,EAAqB,CAAA;YACzC,IAAI,KAAK,EAAE,MAAM,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,CAAC;gBAC/E,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;oBAAE,OAAM;gBAChC,IAAI,IAAI,CAAC,OAAO,IAAI,GAAG,CAAC,OAAO,KAAK,IAAI,CAAC,OAAO;oBAAE,SAAQ;gBAC1D,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC;oBAAE,SAAQ;gBAC5C,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAA;YACtB,CAAC;YAED,MAAM,QAAQ,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;YAEzC,iEAAiE;YACjE,MAAM,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;YAClD,IAAI,OAAO,GAAG,CAAC,CAAA;YAEf,KAAK,MAAM,EAAE,IAAI,GAAG,EAAE,CAAC;gBACtB,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;oBAAE,OAAM;gBAChC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAE,CAAA;gBACzB,IAAI,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,UAAU;oBAAE,SAAQ;gBAE1D,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,CAAA;gBACpC,MAAM,KAAK,GAAG,YAAY,CAAC,GAAG,CAAC,CAAA;gBAE/B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;oBAC1B,MAAM,QAAQ,GAAG,mBAAmB,CAAC,GAAG,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAA;oBAC5D,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;wBAChC,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,OAAO,IAAI,IAAI,CAAC,KAAK;4BAA
E,OAAM;wBAE7D,MAAM,GAAG,GAAG,aAAa,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;wBAC/E,IAAI,CAAC,GAAG;4BAAE,SAAQ;wBAClB,MAAM,OAAO,GAAG,mBAAmB,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;wBAC5D,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,KAAK,CAAC;4BAAE,SAAQ;wBAE/C,MAAM;4BACL,GAAG;4BACH,UAAU,EAAE,OAAO;4BACnB,OAAO,EAAE,GAAG,CAAC,OAAO;4BACpB,MAAM,EAAE,iBAAiB,CAAC,GAAG,CAAC,OAAO,CAAC;4BACtC,MAAM,EAAE,yBAAyB;4BACjC,SAAS,EAAE,GAAG,yBAAyB,IAAI,GAAG,CAAC,EAAE,IAAI,IAAI,CAAC,GAAG,IAAI,OAAO,CAAC,MAAM,EAAE;4BACjF,cAAc,EAAE,EAAE;4BAClB,OAAO,EAAE,SAAS;yBAClB,CAAA;wBACD,OAAO,EAAE,CAAA;oBACV,CAAC;gBACF,CAAC;YACF,CAAC;QACF,CAAC;KACD,CAAA;AACF,CAAC;AAED,MAAM,CAAC,MAAM,oBAAoB,GAAG,0BAA0B,EAAE,CAAA"}
@@ -0,0 +1,58 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Alignment: turn a `CanonicalRow` (raw + components) into a `LabeledRow` (raw + tokens + BIO
7
+ * labels) or a `QuarantinedRow` (raw + reason) per the Phase 1 plan.
8
+ *
9
+ * Pipeline:
10
+ *
11
+ * 1. For each `(tag, value)` in `components`, find the value's character span in `raw`. First try a
12
+ * verbatim substring match (case-insensitive, whitespace-collapsed). If that fails, fall
13
+ * back to fuzzy match via `fastest-levenshtein`, with a tunable edit distance threshold.
14
+ * 2. If any component cannot be located, reject the row with a human-readable reason and send it to
15
+ * the quarantine pile (`reason: "component-not-found:<tag>"` or
16
+ * `"edit-distance-exceeded:<tag>:<dist>"`).
17
+ * 3. Tokenize `raw` with the supplied `Tokenizer` (defaults to the whitespace tokenizer).
18
+ * 4. For each token: walk the list of component spans, pick the one whose span contains the token's
19
+ * character range. First token in a component span → `B-<tag>`; subsequent tokens →
20
+ * `I-<tag>`; no overlap → `O`.
21
+ *
22
+ * Two structural invariants the function preserves:
23
+ *
24
+ * - `tokens.length === labels.length` always.
25
+ * - Each component contributes at most one contiguous BIO run (no `B-tag … O … I-tag` gaps). This is
26
+ * enforced by greedy first-match span assignment + ordered token iteration.
27
+ */
28
+ import { type Tokenizer } from "./tokenize.js";
29
+ import type { CanonicalRow, LabeledRow, QuarantinedRow } from "./types.js";
30
+ /** Options for `alignRow`. Every field is optional; an omitted field uses the default documented on it. */
31
+ export interface AlignOptions {
32
+ /** Tokenizer to use. Defaults to `whitespaceTokenizer()`. */
33
+ tokenizer?: Tokenizer;
34
+ /**
35
+ * Max Levenshtein edit distance to accept when a verbatim substring match fails. Set `0` to
36
+ * require verbatim matches only. Default `2`.
37
+ *
38
+ * Distance is computed against same-length windows in `raw`, so the threshold scales naturally
39
+ * with the component value length.
40
+ */
41
+ maxEditDistance?: number;
42
+ /**
43
+ * Case-insensitive comparison for substring search. Default `true`. The retained span in `raw` is
44
+ * the original case; only matching is case-insensitive.
45
+ */
46
+ caseInsensitive?: boolean;
47
+ }
48
+ /** Either a successful labeled row or a quarantined one. Discriminate on `kind` to narrow the type of `row`. */
48
+ export type AlignmentResult = {
49
+ kind: "labeled";
50
+ row: LabeledRow;
51
+ } | {
52
+ kind: "quarantined";
53
+ row: QuarantinedRow;
54
+ };
56
+ /** Align a single row. Returns `kind: "labeled"` with a `LabeledRow`, or `kind: "quarantined"` with a `QuarantinedRow` (per the file header, when a component cannot be located in `raw`). `opts` may be omitted. */
56
+ export declare function alignRow(row: CanonicalRow, opts?: AlignOptions): AlignmentResult;
58
+ //# sourceMappingURL=align.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"align.d.ts","sourceRoot":"","sources":["../../src/align.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAIH,OAAO,EAAuC,KAAK,SAAS,EAAE,MAAM,eAAe,CAAA;AACnF,OAAO,KAAK,EAAE,YAAY,EAAE,UAAU,EAAE,cAAc,EAAE,MAAM,YAAY,CAAA;AAE1E,8BAA8B;AAC9B,MAAM,WAAW,YAAY;IAC5B,6DAA6D;IAC7D,SAAS,CAAC,EAAE,SAAS,CAAA;IAErB;;;;;;OAMG;IACH,eAAe,CAAC,EAAE,MAAM,CAAA;IAExB;;;OAGG;IACH,eAAe,CAAC,EAAE,OAAO,CAAA;CACzB;AAED,4DAA4D;AAC5D,MAAM,MAAM,eAAe,GAAG;IAAE,IAAI,EAAE,SAAS,CAAC;IAAC,GAAG,EAAE,UAAU,CAAA;CAAE,GAAG;IAAE,IAAI,EAAE,aAAa,CAAC;IAAC,GAAG,EAAE,cAAc,CAAA;CAAE,CAAA;AAQjH,0BAA0B;AAC1B,wBAAgB,QAAQ,CAAC,GAAG,EAAE,YAAY,EAAE,IAAI,GAAE,YAAiB,GAAG,eAAe,CAyCpF"}