@mailwoman/corpus 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/src/adapter.d.ts +96 -0
- package/out/src/adapter.d.ts.map +1 -0
- package/out/src/adapter.js +107 -0
- package/out/src/adapter.js.map +1 -0
- package/out/src/adapters/ban/adapter.d.ts +32 -0
- package/out/src/adapters/ban/adapter.d.ts.map +1 -0
- package/out/src/adapters/ban/adapter.js +133 -0
- package/out/src/adapters/ban/adapter.js.map +1 -0
- package/out/src/adapters/fcc-bdc/adapter.d.ts +61 -0
- package/out/src/adapters/fcc-bdc/adapter.d.ts.map +1 -0
- package/out/src/adapters/fcc-bdc/adapter.js +153 -0
- package/out/src/adapters/fcc-bdc/adapter.js.map +1 -0
- package/out/src/adapters/index.d.ts +42 -0
- package/out/src/adapters/index.d.ts.map +1 -0
- package/out/src/adapters/index.js +76 -0
- package/out/src/adapters/index.js.map +1 -0
- package/out/src/adapters/openaddresses/adapter.d.ts +60 -0
- package/out/src/adapters/openaddresses/adapter.d.ts.map +1 -0
- package/out/src/adapters/openaddresses/adapter.js +174 -0
- package/out/src/adapters/openaddresses/adapter.js.map +1 -0
- package/out/src/adapters/state-ia-contractors/adapter.d.ts +23 -0
- package/out/src/adapters/state-ia-contractors/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-ia-contractors/adapter.js +113 -0
- package/out/src/adapters/state-ia-contractors/adapter.js.map +1 -0
- package/out/src/adapters/state-ny-notaries/adapter.d.ts +21 -0
- package/out/src/adapters/state-ny-notaries/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-ny-notaries/adapter.js +132 -0
- package/out/src/adapters/state-ny-notaries/adapter.js.map +1 -0
- package/out/src/adapters/state-tx-notaries/adapter.d.ts +22 -0
- package/out/src/adapters/state-tx-notaries/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-tx-notaries/adapter.js +125 -0
- package/out/src/adapters/state-tx-notaries/adapter.js.map +1 -0
- package/out/src/adapters/tiger/adapter.d.ts +45 -0
- package/out/src/adapters/tiger/adapter.d.ts.map +1 -0
- package/out/src/adapters/tiger/adapter.js +179 -0
- package/out/src/adapters/tiger/adapter.js.map +1 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts +36 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.js +147 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.js.map +1 -0
- package/out/src/adapters/usgov-imls-pls/adapter.d.ts +25 -0
- package/out/src/adapters/usgov-imls-pls/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-imls-pls/adapter.js +118 -0
- package/out/src/adapters/usgov-imls-pls/adapter.js.map +1 -0
- package/out/src/adapters/usgov-nad/adapter.d.ts +37 -0
- package/out/src/adapters/usgov-nad/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-nad/adapter.js +227 -0
- package/out/src/adapters/usgov-nad/adapter.js.map +1 -0
- package/out/src/adapters/usgov-nppes/adapter.d.ts +28 -0
- package/out/src/adapters/usgov-nppes/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-nppes/adapter.js +123 -0
- package/out/src/adapters/usgov-nppes/adapter.js.map +1 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts +35 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js +162 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js.map +1 -0
- package/out/src/adapters/wof-admin-json/adapter.d.ts +85 -0
- package/out/src/adapters/wof-admin-json/adapter.d.ts.map +1 -0
- package/out/src/adapters/wof-admin-json/adapter.js +241 -0
- package/out/src/adapters/wof-admin-json/adapter.js.map +1 -0
- package/out/src/adapters/wof-postalcode-json/adapter.d.ts +63 -0
- package/out/src/adapters/wof-postalcode-json/adapter.d.ts.map +1 -0
- package/out/src/adapters/wof-postalcode-json/adapter.js +178 -0
- package/out/src/adapters/wof-postalcode-json/adapter.js.map +1 -0
- package/out/src/align.d.ts +58 -0
- package/out/src/align.d.ts.map +1 -0
- package/out/src/align.js +139 -0
- package/out/src/align.js.map +1 -0
- package/out/src/build.d.ts +104 -0
- package/out/src/build.d.ts.map +1 -0
- package/out/src/build.js +201 -0
- package/out/src/build.js.map +1 -0
- package/out/src/codex/us-fips-state.d.ts +44 -0
- package/out/src/codex/us-fips-state.d.ts.map +1 -0
- package/out/src/codex/us-fips-state.js +105 -0
- package/out/src/codex/us-fips-state.js.map +1 -0
- package/out/src/codex/us-street-suffix.d.ts +259 -0
- package/out/src/codex/us-street-suffix.d.ts.map +1 -0
- package/out/src/codex/us-street-suffix.js +285 -0
- package/out/src/codex/us-street-suffix.js.map +1 -0
- package/out/src/format.d.ts +79 -0
- package/out/src/format.d.ts.map +1 -0
- package/out/src/format.js +151 -0
- package/out/src/format.js.map +1 -0
- package/out/src/golden.d.ts +50 -0
- package/out/src/golden.d.ts.map +1 -0
- package/out/src/golden.js +104 -0
- package/out/src/golden.js.map +1 -0
- package/out/src/index.d.ts +18 -0
- package/out/src/index.d.ts.map +1 -0
- package/out/src/index.js +18 -0
- package/out/src/index.js.map +1 -0
- package/out/src/parquet-wrapper/index.d.ts +12 -0
- package/out/src/parquet-wrapper/index.d.ts.map +1 -0
- package/out/src/parquet-wrapper/index.js +12 -0
- package/out/src/parquet-wrapper/index.js.map +1 -0
- package/out/src/parquet-wrapper/reader.d.ts +31 -0
- package/out/src/parquet-wrapper/reader.d.ts.map +1 -0
- package/out/src/parquet-wrapper/reader.js +54 -0
- package/out/src/parquet-wrapper/reader.js.map +1 -0
- package/out/src/parquet-wrapper/schema.d.ts +45 -0
- package/out/src/parquet-wrapper/schema.d.ts.map +1 -0
- package/out/src/parquet-wrapper/schema.js +55 -0
- package/out/src/parquet-wrapper/schema.js.map +1 -0
- package/out/src/parquet-wrapper/writer.d.ts +41 -0
- package/out/src/parquet-wrapper/writer.d.ts.map +1 -0
- package/out/src/parquet-wrapper/writer.js +71 -0
- package/out/src/parquet-wrapper/writer.js.map +1 -0
- package/out/src/parquet.d.ts +122 -0
- package/out/src/parquet.d.ts.map +1 -0
- package/out/src/parquet.js +220 -0
- package/out/src/parquet.js.map +1 -0
- package/out/src/runner.d.ts +100 -0
- package/out/src/runner.d.ts.map +1 -0
- package/out/src/runner.js +183 -0
- package/out/src/runner.js.map +1 -0
- package/out/src/split.d.ts +108 -0
- package/out/src/split.d.ts.map +1 -0
- package/out/src/split.js +191 -0
- package/out/src/split.js.map +1 -0
- package/out/src/synthesize.d.ts +146 -0
- package/out/src/synthesize.d.ts.map +1 -0
- package/out/src/synthesize.js +472 -0
- package/out/src/synthesize.js.map +1 -0
- package/out/src/tokenize.d.ts +47 -0
- package/out/src/tokenize.d.ts.map +1 -0
- package/out/src/tokenize.js +49 -0
- package/out/src/tokenize.js.map +1 -0
- package/out/src/types.d.ts +168 -0
- package/out/src/types.d.ts.map +1 -0
- package/out/src/types.js +19 -0
- package/out/src/types.js.map +1 -0
- package/out/src/wof-json.d.ts +105 -0
- package/out/src/wof-json.d.ts.map +1 -0
- package/out/src/wof-json.js +174 -0
- package/out/src/wof-json.js.map +1 -0
- package/package.json +36 -0
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `wof-admin`: Who's On First admin GeoJSON-bundle adapter.
|
|
7
|
+
*
|
|
8
|
+
* **Phase 1.5.1 pivot.** The original Phase 1.5 SQLite adapter (formerly at
|
|
9
|
+
* `packages/corpus/src/adapters/wof-admin/`, removed in this same change) was replaced by this
|
|
10
|
+
* one because the SQLite distribution path was unworkable for the real corpus build:
|
|
11
|
+
*
|
|
12
|
+
* 1. `dist.whosonfirst.org/sqlite/` is dead (NXDOMAIN); the Geocode-Earth mirror is the only one.
|
|
13
|
+
* 2. The Geocode-Earth-hosted postalcode DB tags every row `mz:is_current = -1` ("unknown but treated
|
|
14
|
+
* as active"); the SQLite adapter's `is_current = 1` predicate emitted zero rows.
|
|
15
|
+
* 3. The `names` table in the SQLite distribution is empty — localized `name:*` variants live in a
|
|
16
|
+
* separate distribution. The St. Petersburg / Mt. Vernon / Ft. Lauderdale alternation cases
|
|
17
|
+
* (the original Phase 1.5.1 motivator) cannot be solved on the SQLite path even with a
|
|
18
|
+
* patched `is_current` predicate.
|
|
19
|
+
*
|
|
20
|
+
* Input: a directory containing one or more cloned `whosonfirst-data-admin-<cc>` GitHub repos. Each
|
|
21
|
+
* repo has `data/XXX/YYY/ZZZ/<wof-id>.geojson` files; `**\/*.geojson` walks the tree recursively.
|
|
22
|
+
* Alternate-geometry siblings (`-alt-*`) are skipped — they're separate exports of the same
|
|
23
|
+
* record, not new records.
|
|
24
|
+
*
|
|
25
|
+
* Per record, the adapter emits one row per `(name-variant, hierarchy-variant)` pair:
|
|
26
|
+
*
|
|
27
|
+
* - **Name variants**: the canonical `wof:name` (slot key `default`) plus every `name:*` localized
|
|
28
|
+
* variant present on the feature (`name:eng_x_preferred`, `name:eng_x_colloquial`,
|
|
29
|
+
* `name:rus_x_preferred`, ...). This is the Phase 1.5.1 fix for the St. Petersburg case:
|
|
30
|
+
* `"Saint Petersburg"` (canonical) and `"St. Petersburg"` (eng_x_colloquial) both become
|
|
31
|
+
* training rows for the same WOF id.
|
|
32
|
+
* - **Hierarchy variants** (unchanged from the SQLite adapter): locality → up to 3 variants, region → up to 2 (fewer when an ancestor is missing),
|
|
33
|
+
* country → 1, county → 1.
|
|
34
|
+
*
|
|
35
|
+
* `source_id` is `wof-admin-<wof_id>-<name-slot>-<hierarchy-variant>`. The previous SQLite adapter
|
|
36
|
+
* used `wof-admin-<wof_id>-<hierarchy-variant>` (no name slot); the new format adds a name-slot
|
|
37
|
+
* segment so the colloquial / preferred / per-locale variants survive dedup independently.
|
|
38
|
+
*
|
|
39
|
+
* License: CC0. The adapter stamps every row with `CC0-1.0`.
|
|
40
|
+
*/
|
|
41
|
+
import { formatAddress, reconcileComponents } from "../../format.js";
|
|
42
|
+
import { buildAncestryIndex, normalizeNameKey, walkFeatures } from "../../wof-json.js";
|
|
43
|
+
/**
 * Country display names, keyed by ISO 3166-1 alpha-2 code.
 *
 * Each value must be the **OpenCage-canonical** surface form. The `address-formatter` library
 * expands some country names on output (e.g. `"United States"` → `"United States of America"`),
 * and downstream alignment fails when `components.country` disagrees with the formatted `raw`.
 * Storing the already-expanded form keeps the two in lockstep.
 *
 * Phase 1 covers US + FR only; add entries as new locales come online. Countries without an
 * entry fall back to the country record's `wof:name`, accepting the alignment risk for
 * non-canonicalized names.
 */
const COUNTRY_DISPLAY_NAME = {
    US: "United States of America",
    FR: "France",
};

/**
 * Default BCP-47 locale for a corpus row's `locale` field, keyed by ISO 3166-1 alpha-2 code.
 * A lookup for a country without an entry yields `undefined`.
 */
const LOCALE_BY_COUNTRY = {
    US: "en-US",
    FR: "fr-FR",
};
|
|
63
|
+
/**
 * Translate a Who's On First placetype into the Mailwoman `ComponentTag` it is emitted under.
 *
 * @param placetype - WOF placetype string (e.g. `"locality"`, `"county"`).
 * @returns The matching tag, or `undefined` when the placetype has no mapping and the record
 *   should be skipped.
 */
function placetypeToTag(placetype) {
    if (placetype === "country" || placetype === "nation") {
        return "country";
    }
    if (placetype === "macroregion" || placetype === "region") {
        return "region";
    }
    if (placetype === "macrocounty" || placetype === "county" || placetype === "localadmin") {
        return "subregion";
    }
    if (placetype === "locality") {
        return "locality";
    }
    if (placetype === "borough" ||
        placetype === "macrohood" ||
        placetype === "neighbourhood" ||
        placetype === "microhood") {
        return "dependent_locality";
    }
    return undefined;
}
|
|
87
|
+
/**
 * Compute the hierarchy variants for one record, given its ancestry chain and the surface form
 * chosen for the record itself.
 *
 * `selfName` is used verbatim as the record's own component; this function never rewrites it.
 * Ancestor names come from the ancestor record's `name`, except the ancestor country, which
 * resolves as `COUNTRY_DISPLAY_NAME[row.country]`, then the country ancestor's `name`, then the
 * raw `row.country` code.
 *
 * Variants by tag:
 * - `locality` / `dependent_locality`: `self`; plus `with-region` when a region ancestor exists;
 *   plus `with-region-country` when both region and country exist, or `with-country` when only
 *   the country exists.
 * - `region`: `self`; plus `with-country` when a country ancestor exists.
 * - `country`, `subregion`: `self` only.
 *
 * @param row - The record being expanded; `placetype` and `country` are read.
 * @param ancestry - Ancestor records; the first one mapping to each tag is used.
 * @param selfName - Surface form for the record's own component.
 * @returns `{ suffix, components }` specs, or `[]` when the placetype maps to no tag.
 */
export function variantsFor(row, ancestry, selfName) {
    const ownTag = placetypeToTag(row.placetype);
    if (!ownTag) {
        return [];
    }
    const firstAncestorTagged = (tag) => ancestry.find((ancestor) => placetypeToTag(ancestor.placetype) === tag);
    const regionRec = firstAncestorTagged("region");
    const countryRec = firstAncestorTagged("country");
    const countryLabel = COUNTRY_DISPLAY_NAME[row.country] ?? countryRec?.name ?? row.country;
    // Every mapped tag starts with the bare self variant.
    const selfOnly = { suffix: "self", components: { [ownTag]: selfName } };
    if (ownTag === "country" || ownTag === "subregion") {
        return [selfOnly];
    }
    if (ownTag === "region") {
        if (!countryRec) {
            return [selfOnly];
        }
        return [selfOnly, { suffix: "with-country", components: { region: selfName, country: countryLabel } }];
    }
    if (ownTag === "locality" || ownTag === "dependent_locality") {
        const specs = [selfOnly];
        if (regionRec) {
            specs.push({
                suffix: "with-region",
                components: { [ownTag]: selfName, region: regionRec.name },
            });
            if (countryRec) {
                specs.push({
                    suffix: "with-region-country",
                    components: { [ownTag]: selfName, region: regionRec.name, country: countryLabel },
                });
            }
        }
        else if (countryRec) {
            // No region in the chain: attach the country directly.
            specs.push({
                suffix: "with-country",
                components: { [ownTag]: selfName, country: countryLabel },
            });
        }
        return specs;
    }
    return [];
}
|
|
153
|
+
/**
 * Build the ordered list of name slots for one record.
 *
 * The first slot is always `{ key: "default" }`. Its value is `rec.name`, except for records
 * whose placetype maps to `country`, where the `COUNTRY_DISPLAY_NAME` entry for `rec.country`
 * is preferred (falling back to `rec.name`) — matching the SQLite adapter's behavior.
 *
 * Each following slot comes from an entry of `rec.nameVariants`, keyed by
 * `normalizeNameKey(rawKey)`. A variant whose value has already been emitted — by the default
 * slot or an earlier variant — is dropped, so no surface form appears twice.
 *
 * @param rec - The record; `placetype`, `country`, `name` and `nameVariants` are read.
 * @returns Slots in emission order, as `{ key, value }` objects.
 */
export function nameSlotsFor(rec) {
    let defaultName;
    if (placetypeToTag(rec.placetype) === "country") {
        defaultName = COUNTRY_DISPLAY_NAME[rec.country] ?? rec.name;
    }
    else {
        defaultName = rec.name;
    }
    const emittedValues = new Set([defaultName]);
    const result = [{ key: "default", value: defaultName }];
    for (const [variantKey, surface] of rec.nameVariants) {
        if (!emittedValues.has(surface)) {
            emittedValues.add(surface);
            result.push({ key: normalizeNameKey(variantKey), value: surface });
        }
    }
    return result;
}
|
|
174
|
+
/**
 * Identifier of the wof-admin adapter. Used as the adapter's `id`, as the `source` value on every
 * emitted row, and as the leading segment of each row's `source_id`.
 */
export const WOF_ADMIN_ADAPTER_ID = "wof-admin";
|
|
175
|
+
/**
 * Construct the wof-admin JSON-bundle adapter.
 *
 * The returned object holds no state between runs. `rows(opts)` works in two passes:
 *
 * 1. Walk every feature under `opts.inputPath`, keeping only records whose placetype maps to a
 *    tag and (when `opts.country` is set) whose country matches. Everything is indexed by id.
 * 2. Emit rows in ascending id order, so output does not depend on filesystem walk order and two
 *    runs over the same input produce byte-identical `canonical.jsonl`.
 *
 * Iteration ends quietly when `opts.signal` is aborted, or once `opts.limit` rows were yielded.
 *
 * @returns The adapter object (`id`, `defaultLicense`, `description`, `rows`).
 */
export function createWofAdminAdapter() {
    return {
        id: WOF_ADMIN_ADAPTER_ID,
        defaultLicense: "CC0-1.0",
        description: "Who's On First admin GeoJSON bundles (countries, regions, counties, localities) — multi-name variants per record.",
        async *rows(opts) {
            // Pass 1: one scan over the GeoJSON files to build the in-memory index. Placetypes
            // without a tag are dropped here so they do not inflate the ancestry index; a
            // country-filtered run also prunes other countries, since the ancestors of a
            // same-country record live in the same admin repo.
            const records = new Map();
            for await (const feature of walkFeatures(opts.inputPath, { signal: opts.signal })) {
                if (opts.signal?.aborted) {
                    return;
                }
                const otherCountry = opts.country && feature.country !== opts.country;
                if (otherCountry || !placetypeToTag(feature.placetype)) {
                    continue;
                }
                records.set(feature.id, feature);
            }
            const ancestryById = buildAncestryIndex(records);
            // Pass 2: numeric sort on the ids makes the emission order deterministic.
            const sortedIds = Array.from(records.keys()).sort((left, right) => left - right);
            let yielded = 0;
            for (const wofId of sortedIds) {
                if (opts.signal?.aborted) {
                    return;
                }
                const record = records.get(wofId);
                const chain = ancestryById.get(wofId) ?? [];
                for (const { key: slotKey, value: selfName } of nameSlotsFor(record)) {
                    for (const { suffix, components } of variantsFor(record, chain, selfName)) {
                        if (opts.limit !== undefined && yielded >= opts.limit) {
                            return;
                        }
                        const raw = formatAddress(components, record.country, { separator: ", " });
                        if (!raw) {
                            continue;
                        }
                        // Skip rows where no component survives reconciliation with `raw`.
                        const aligned = reconcileComponents(components, raw);
                        if (Object.keys(aligned).length === 0) {
                            continue;
                        }
                        yield {
                            raw,
                            components: aligned,
                            country: record.country,
                            // `undefined` for countries without a LOCALE_BY_COUNTRY entry.
                            locale: LOCALE_BY_COUNTRY[record.country],
                            source: WOF_ADMIN_ADAPTER_ID,
                            source_id: `${WOF_ADMIN_ADAPTER_ID}-${record.id}-${slotKey}-${suffix}`,
                            // Left empty here — presumably stamped by a later pipeline stage; confirm.
                            corpus_version: "",
                            license: "CC0-1.0",
                        };
                        yielded += 1;
                    }
                }
            }
        },
    };
}
|
|
239
|
+
/** Single shared instance, suitable for `defaultAdapterRegistry`. */
|
|
240
|
+
// Built once at module load. Construction only creates the adapter object; no input is read
// until the async generator returned by `rows()` is iterated.
export const wofAdminAdapter = createWofAdminAdapter();
|
|
241
|
+
//# sourceMappingURL=adapter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../../../src/adapters/wof-admin-json/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAuCG;AAIH,OAAO,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAA;AAEpE,OAAO,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,YAAY,EAAkB,MAAM,mBAAmB,CAAA;AAEtG;;;;;;;;;;GAUG;AACH,MAAM,oBAAoB,GAA2B;IACpD,EAAE,EAAE,0BAA0B;IAC9B,EAAE,EAAE,QAAQ;CACZ,CAAA;AAED,0FAA0F;AAC1F,MAAM,iBAAiB,GAA2B;IACjD,EAAE,EAAE,OAAO;IACX,EAAE,EAAE,OAAO;CACX,CAAA;AAED,iFAAiF;AACjF,SAAS,cAAc,CAAC,SAAwC;IAC/D,QAAQ,SAAS,EAAE,CAAC;QACnB,KAAK,SAAS,CAAC;QACf,KAAK,QAAQ;YACZ,OAAO,SAAS,CAAA;QACjB,KAAK,aAAa,CAAC;QACnB,KAAK,QAAQ;YACZ,OAAO,QAAQ,CAAA;QAChB,KAAK,aAAa,CAAC;QACnB,KAAK,QAAQ,CAAC;QACd,KAAK,YAAY;YAChB,OAAO,WAAW,CAAA;QACnB,KAAK,UAAU;YACd,OAAO,UAAU,CAAA;QAClB,KAAK,SAAS,CAAC;QACf,KAAK,WAAW,CAAC;QACjB,KAAK,eAAe,CAAC;QACrB,KAAK,WAAW;YACf,OAAO,oBAAoB,CAAA;QAC5B;YACC,OAAO,SAAS,CAAA;IAClB,CAAC;AACF,CAAC;AAUD;;;;;;;;;;;GAWG;AACH,MAAM,UAAU,WAAW,CAAC,GAAc,EAAE,QAAqB,EAAE,QAAgB;IAClF,MAAM,OAAO,GAAG,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,CAAA;IAC7C,IAAI,CAAC,OAAO;QAAE,OAAO,EAAE,CAAA;IAEvB,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,QAAQ,CAAC,CAAA;IAC7E,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,SAAS,CAAC,CAAA;IAC/E,MAAM,cAAc,GAAG,oBAAoB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,OAAO,EAAE,IAAI,IAAI,GAAG,CAAC,OAAO,CAAA;IAExF,MAAM,QAAQ,GAAkB,EAAE,CAAA;IAElC,QAAQ,OAAO,EAAE,CAAC;QACjB,KAAK,UAAU,CAAC;QAChB,KAAK,oBAAoB,CAAC,CAAC,CAAC;YAC3B,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,EAAE,CAAC,OAAO,CAAC,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAA;YACtE,IAAI,MAAM,EAAE,CAAC;gBACZ,QAAQ,CAAC,IAAI,CAAC;oBACb,MAAM,EAAE,aAAa;oBACrB,UAAU,EAAE,EAAE,CAAC,OAAO,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE;iBACxD,CAAC,CAAA;YACH,CAAC;YACD,IAAI,MAAM,IAAI,OAAO,EAAE,CAAC;gBACvB,QAAQ,CAAC,IAAI,CAAC;oBACb,MAAM,EAAE,qBAAqB;oBAC7B,UAAU,EAAE,EAAE,CAAC,OAAO,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC,IAAI,EAA
E,OAAO,EAAE,cAAc,EAAE;iBACjF,CAAC,CAAA;YACH,CAAC;iBAAM,IAAI,CAAC,MAAM,IAAI,OAAO,EAAE,CAAC;gBAC/B,QAAQ,CAAC,IAAI,CAAC;oBACb,MAAM,EAAE,cAAc;oBACtB,UAAU,EAAE,EAAE,CAAC,OAAO,CAAC,EAAE,QAAQ,EAAE,OAAO,EAAE,cAAc,EAAE;iBAC5D,CAAC,CAAA;YACH,CAAC;YACD,OAAO,QAAQ,CAAA;QAChB,CAAC;QAED,KAAK,QAAQ,CAAC,CAAC,CAAC;YACf,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAA;YACnE,IAAI,OAAO,EAAE,CAAC;gBACb,QAAQ,CAAC,IAAI,CAAC;oBACb,MAAM,EAAE,cAAc;oBACtB,UAAU,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,cAAc,EAAE;iBACzD,CAAC,CAAA;YACH,CAAC;YACD,OAAO,QAAQ,CAAA;QAChB,CAAC;QAED,KAAK,SAAS,CAAC,CAAC,CAAC;YAChB,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,EAAE,OAAO,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAA;YACpE,OAAO,QAAQ,CAAA;QAChB,CAAC;QAED,KAAK,WAAW,CAAC,CAAC,CAAC;YAClB,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,EAAE,SAAS,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAA;YACtE,OAAO,QAAQ,CAAA;QAChB,CAAC;QAED;YACC,OAAO,EAAE,CAAA;IACX,CAAC;AACF,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,YAAY,CAAC,GAAc;IAC1C,MAAM,OAAO,GAAG,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,CAAA;IAC7C,MAAM,iBAAiB,GAAG,OAAO,KAAK,SAAS,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAA;IAE5G,MAAM,IAAI,GAAG,IAAI,GAAG,CAAS,CAAC,iBAAiB,CAAC,CAAC,CAAA;IACjD,MAAM,KAAK,GAA0C,CAAC,EAAE,GAAG,EAAE,SAAS,EAAE,KAAK,EAAE,iBAAiB,EAAE,CAAC,CAAA;IAEnG,KAAK,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,GAAG,CAAC,YAAY,EAAE,CAAC;QAChD,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC;YAAE,SAAQ;QAC7B,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,CAAA;QACf,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,gBAAgB,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,CAAC,CAAA;IACrD,CAAC;IAED,OAAO,KAAK,CAAA;AACb,CAAC;AAED,MAAM,CAAC,MAAM,oBAAoB,GAAG,WAAW,CAAA;AAE/C;;;;GAIG;AACH,MAAM,UAAU,qBAAqB;IACpC,OAAO;QACN,EAAE,EAAE,oBAAoB;QACxB,cAAc,EAAE,SAAS;QACzB,WAAW,EACV,mHAAmH;QAEpH,KAAK,CAAC,CAAC,IAAI,CAAC,IAAoB;YAC/B,0EAA0E;YAC1E,sFAAsF;YACtF,wFAAwF;YACxF,qFAAqF;YACrF,wFAAwF;YACxF,MAAM,IAAI,GAAG,IAAI,GAAG,EAAqB,CAAA;YACzC,IAAI,KAAK,EAAE,MAAM,GAAG,IAAI,YAAY,CAAC,IAA
I,CAAC,SAAS,EAAE,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,CAAC;gBAC/E,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;oBAAE,OAAM;gBAChC,IAAI,IAAI,CAAC,OAAO,IAAI,GAAG,CAAC,OAAO,KAAK,IAAI,CAAC,OAAO;oBAAE,SAAQ;gBAC1D,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC;oBAAE,SAAQ;gBAC5C,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAA;YACtB,CAAC;YAED,MAAM,QAAQ,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;YAEzC,gEAAgE;YAChE,MAAM,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;YAClD,IAAI,OAAO,GAAG,CAAC,CAAA;YAEf,KAAK,MAAM,EAAE,IAAI,GAAG,EAAE,CAAC;gBACtB,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;oBAAE,OAAM;gBAChC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAE,CAAA;gBACzB,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,CAAA;gBACpC,MAAM,KAAK,GAAG,YAAY,CAAC,GAAG,CAAC,CAAA;gBAE/B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;oBAC1B,MAAM,QAAQ,GAAG,WAAW,CAAC,GAAG,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAA;oBACpD,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;wBAChC,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,OAAO,IAAI,IAAI,CAAC,KAAK;4BAAE,OAAM;wBAE7D,MAAM,GAAG,GAAG,aAAa,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;wBAC/E,IAAI,CAAC,GAAG;4BAAE,SAAQ;wBAClB,MAAM,OAAO,GAAG,mBAAmB,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;wBAC5D,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,KAAK,CAAC;4BAAE,SAAQ;wBAE/C,MAAM;4BACL,GAAG;4BACH,UAAU,EAAE,OAAO;4BACnB,OAAO,EAAE,GAAG,CAAC,OAAO;4BACpB,MAAM,EAAE,iBAAiB,CAAC,GAAG,CAAC,OAAO,CAAC;4BACtC,MAAM,EAAE,oBAAoB;4BAC5B,SAAS,EAAE,GAAG,oBAAoB,IAAI,GAAG,CAAC,EAAE,IAAI,IAAI,CAAC,GAAG,IAAI,OAAO,CAAC,MAAM,EAAE;4BAC5E,cAAc,EAAE,EAAE;4BAClB,OAAO,EAAE,SAAS;yBAClB,CAAA;wBACD,OAAO,EAAE,CAAA;oBACV,CAAC;gBACF,CAAC;YACF,CAAC;QACF,CAAC;KACD,CAAA;AACF,CAAC;AAED,qEAAqE;AACrE,MAAM,CAAC,MAAM,eAAe,GAAG,qBAAqB,EAAE,CAAA"}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `wof-postalcode`: Who's On First postalcode GeoJSON-bundle adapter.
|
|
7
|
+
*
|
|
8
|
+
* **Phase 1.5.1 pivot.** Replaces the previous SpatiaLite-backed implementation (formerly at
|
|
9
|
+
* `packages/corpus/src/adapters/wof-postalcode/`, removed in this same change). The rationale is
|
|
10
|
+
* in `wof-admin-json/adapter.ts` and in `DECISIONS.md` — short version: the SQLite distribution
|
|
11
|
+
* mirror is dead, the live distro tags every postcode row `mz:is_current = -1` which the old
|
|
12
|
+
* `is_current = 1` predicate excluded, and localized `name:*` variants don't ship in the SQLite
|
|
13
|
+
* export at all.
|
|
14
|
+
*
|
|
15
|
+
* Input: a directory containing one or more cloned `whosonfirst-data-postalcode-<cc>` repos plus
|
|
16
|
+
* the relevant `whosonfirst-data-admin-<cc>` repos (postcode records reference admin ancestry by
|
|
17
|
+
* `wof:parent_id`, so the locality / region / country records must be in the same walk for the
|
|
18
|
+
* ancestry chain to resolve). The corpus pipeline clones all four repos under
|
|
19
|
+
* `/data/corpus/sources/wof/repos/` and points the adapter at that root.
|
|
20
|
+
*
|
|
21
|
+
* Per live postalcode record, the adapter emits one row per `(name-variant, hierarchy-variant)`
|
|
22
|
+
* pair:
|
|
23
|
+
*
|
|
24
|
+
* - **Name variants**: canonical `wof:name` (slot key `default`, typically the postcode digits
|
|
25
|
+
* themselves) plus any `name:*` variants on the postcode feature. In practice WOF postcode
|
|
26
|
+
* records rarely carry localized name variants, so this expansion is usually a no-op — but
|
|
27
|
+
* the code path stays symmetric with the admin adapter for consistency.
|
|
28
|
+
* - **Hierarchy variants** (unchanged from the SQLite adapter): self, +locality, +locality+region,
|
|
29
|
+
* +locality+region+country.
|
|
30
|
+
*
|
|
31
|
+
* `source_id` is `wof-postalcode-<wof_id>-<name-slot>-<hierarchy-variant>`. Ancestor names always
|
|
32
|
+
* come from the ancestor's canonical `wof:name`; this adapter does NOT iterate ancestor name
|
|
33
|
+
* variants (e.g. it does not emit `"75008 Париж"` even when Paris has a `name:rus_x_preferred`).
|
|
34
|
+
* That cross-product belongs to a future synthesis pass; emitting it here would multiply row
|
|
35
|
+
* counts ~10× without a clear training-value story.
|
|
36
|
+
*
|
|
37
|
+
* License: CC0.
|
|
38
|
+
*/
|
|
39
|
+
import type { ComponentTag } from "@mailwoman/core/types";
|
|
40
|
+
import type { CorpusAdapter } from "../../types.js";
|
|
41
|
+
import { type WofRecord } from "../../wof-json.js";
|
|
42
|
+
/** One hierarchy variant of a record: a label plus the address components it is formatted from. */
interface VariantSpec {
    /** Hierarchy-variant label; it forms the final `<hierarchy-variant>` segment of `source_id`. */
    suffix: string;
    /** Component values for this variant, keyed by component tag; absent tags are not part of it. */
    components: Partial<Record<ComponentTag, string>>;
}
|
|
46
|
+
/**
 * Compute the hierarchy variants for one postcode record.
 *
 * @param row - The postcode record being expanded.
 * @param ancestry - Ancestor records that supply the locality / region / country names.
 * @param selfName - Surface form of the postcode: the canonical `wof:name` for the `default`
 *   slot, or a `name:*` localized variant for any other slot.
 * @returns One spec per hierarchy variant.
 */
export declare function postcodeVariantsFor(
    row: WofRecord,
    ancestry: WofRecord[],
    selfName: string,
): VariantSpec[];
|
|
51
|
+
/**
 * Build the per-record name-slot list.
 *
 * The `default` slot carries `wof:name` verbatim (the postcode digits). Further slots come from
 * the record's `name:*` variants, deduplicated against the default.
 *
 * @param rec - The postcode record to expand.
 * @returns The slots as `{ key, value }` pairs.
 */
export declare function nameSlotsFor(rec: WofRecord): Array<{ key: string; value: string }>;
|
|
59
|
+
/** Identifier of the wof-postalcode adapter; it is the leading segment of every `source_id`. */
export declare const WOF_POSTALCODE_ADAPTER_ID = "wof-postalcode";
|
|
60
|
+
/** Construct a wof-postalcode JSON-bundle adapter. */
export declare function createWofPostalcodeAdapter(): CorpusAdapter;
|
|
61
|
+
/**
 * Ready-made wof-postalcode adapter instance.
 * NOTE(review): presumably the result of `createWofPostalcodeAdapter()` — the initializer is not
 * visible in this declaration file; confirm against the implementation.
 */
export declare const wofPostalcodeAdapter: CorpusAdapter;
|
|
62
|
+
export {};
|
|
63
|
+
//# sourceMappingURL=adapter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../../../src/adapters/wof-postalcode-json/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AAGH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAA;AAEzD,OAAO,KAAK,EAAgC,aAAa,EAAE,MAAM,gBAAgB,CAAA;AACjF,OAAO,EAAsD,KAAK,SAAS,EAAE,MAAM,mBAAmB,CAAA;AA6BtG,UAAU,WAAW;IACpB,MAAM,EAAE,MAAM,CAAA;IACd,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAA;CACjD;AAED;;;GAGG;AACH,wBAAgB,mBAAmB,CAAC,GAAG,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,EAAE,QAAQ,EAAE,MAAM,GAAG,WAAW,EAAE,CAmC1G;AAED;;;GAGG;AACH,wBAAgB,YAAY,CAAC,GAAG,EAAE,SAAS,GAAG,KAAK,CAAC;IAAE,GAAG,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,CASlF;AAED,eAAO,MAAM,yBAAyB,mBAAmB,CAAA;AAEzD,wBAAgB,0BAA0B,IAAI,aAAa,CA2D1D;AAED,eAAO,MAAM,oBAAoB,eAA+B,CAAA"}
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `wof-postalcode`: Who's On First postalcode GeoJSON-bundle adapter.
|
|
7
|
+
*
|
|
8
|
+
* **Phase 1.5.1 pivot.** Replaces the previous SpatiaLite-backed implementation (formerly at
|
|
9
|
+
* `packages/corpus/src/adapters/wof-postalcode/`, removed in this same change). The rationale is
|
|
10
|
+
* in `wof-admin-json/adapter.ts` and in `DECISIONS.md` — short version: the SQLite distribution
|
|
11
|
+
* mirror is dead, the live distro tags every postcode row `mz:is_current = -1` which the old
|
|
12
|
+
* `is_current = 1` predicate excluded, and localized `name:*` variants don't ship in the SQLite
|
|
13
|
+
* export at all.
|
|
14
|
+
*
|
|
15
|
+
* Input: a directory containing one or more cloned `whosonfirst-data-postalcode-<cc>` repos plus
|
|
16
|
+
* the relevant `whosonfirst-data-admin-<cc>` repos (postcode records reference admin ancestry by
|
|
17
|
+
* `wof:parent_id`, so the locality / region / country records must be in the same walk for the
|
|
18
|
+
* ancestry chain to resolve). The corpus pipeline clones all four repos under
|
|
19
|
+
* `/data/corpus/sources/wof/repos/` and points the adapter at that root.
|
|
20
|
+
*
|
|
21
|
+
* Per live postalcode record, the adapter emits one row per `(name-variant, hierarchy-variant)`
|
|
22
|
+
* pair:
|
|
23
|
+
*
|
|
24
|
+
* - **Name variants**: canonical `wof:name` (slot key `default`, typically the postcode digits
|
|
25
|
+
* themselves) plus any `name:*` variants on the postcode feature. In practice WOF postcode
|
|
26
|
+
* records rarely carry localized name variants, so this expansion is usually a no-op — but
|
|
27
|
+
* the code path stays symmetric with the admin adapter for consistency.
|
|
28
|
+
* - **Hierarchy variants** (unchanged from the SQLite adapter): self, +locality, +locality+region,
|
|
29
|
+
* +locality+region+country.
|
|
30
|
+
*
|
|
31
|
+
* `source_id` is `wof-postalcode-<wof_id>-<name-slot>-<hierarchy-variant>`. Ancestor names always
|
|
32
|
+
* come from the ancestor's canonical `wof:name`; this adapter does NOT iterate ancestor name
|
|
33
|
+
* variants (e.g. it does not emit `"75008 Париж"` even when Paris has a `name:rus_x_preferred`).
|
|
34
|
+
* That cross-product belongs to a future synthesis pass; emitting it here would multiply row
|
|
35
|
+
* counts ~10× without a clear training-value story.
|
|
36
|
+
*
|
|
37
|
+
* License: CC0.
|
|
38
|
+
*/
|
|
39
|
+
import { formatAddress, reconcileComponents } from "../../format.js";
|
|
40
|
+
import { buildAncestryIndex, normalizeNameKey, walkFeatures } from "../../wof-json.js";
|
|
41
|
+
/**
 * Display name used for the `country` component, keyed by a record's `country` code.
 *
 * Built on a null-prototype object: the table is indexed with values read from data, and a
 * plain object literal would resolve keys such as `"constructor"` or `"toString"` to inherited
 * `Object.prototype` members instead of reporting a miss.
 */
const COUNTRY_DISPLAY_NAME = Object.assign(Object.create(null), {
    US: "United States of America",
    FR: "France",
});
/**
 * Locale tag attached to emitted rows, keyed by a record's `country` code. Countries without
 * an entry yield `undefined`. Null-prototype for the same reason as `COUNTRY_DISPLAY_NAME`.
 */
const LOCALE_BY_COUNTRY = Object.assign(Object.create(null), {
    US: "en-US",
    FR: "fr-FR",
});
|
|
49
|
+
/**
 * Map a Who's On First placetype onto the component tag used by this adapter.
 *
 * `country`/`nation` collapse to `"country"`, `macroregion`/`region` to `"region"`,
 * `locality` stays `"locality"`, and `postalcode` becomes `"postcode"`. Any other placetype
 * yields `undefined`.
 */
function placetypeToTag(placetype) {
    if (placetype === "country" || placetype === "nation") {
        return "country";
    }
    if (placetype === "macroregion" || placetype === "region") {
        return "region";
    }
    if (placetype === "locality") {
        return "locality";
    }
    if (placetype === "postalcode") {
        return "postcode";
    }
    return undefined;
}
|
|
65
|
+
/**
 * List the hierarchy variants for one postcode record and one surface form.
 *
 * Returns an empty array when `row` is not a postcode record. Otherwise the result always
 * starts with the `self` variant and grows by one entry per available ancestor level, in the
 * fixed order locality → region → country; the chain stops at the first missing level.
 *
 * @param row Postcode record; its `placetype` and `country` are read.
 * @param ancestry Ancestor records; the first entry matching each tag is used.
 * @param selfName Surface form placed in the `postcode` component of every variant.
 * @returns Array of `{ suffix, components }` entries.
 */
export function postcodeVariantsFor(row, ancestry, selfName) {
    if (placetypeToTag(row.placetype) !== "postcode") {
        return [];
    }
    const firstAncestorTagged = (tag) => ancestry.find((a) => placetypeToTag(a.placetype) === tag);
    const locality = firstAncestorTagged("locality");
    const region = firstAncestorTagged("region");
    const country = firstAncestorTagged("country");
    // Prefer the curated display name, then the ancestor's own name, then the raw country code.
    const countryDisplay = COUNTRY_DISPLAY_NAME[row.country] ?? country?.name ?? row.country;
    const result = [{ suffix: "self", components: { postcode: selfName } }];
    if (!locality) {
        return result;
    }
    const upToLocality = { postcode: selfName, locality: locality.name };
    result.push({ suffix: "with-locality", components: upToLocality });
    if (!region) {
        return result;
    }
    const upToRegion = { ...upToLocality, region: region.name };
    result.push({ suffix: "with-locality-region", components: upToRegion });
    if (!country) {
        return result;
    }
    result.push({
        suffix: "with-locality-region-country",
        components: { ...upToRegion, country: countryDisplay },
    });
    return result;
}
|
|
102
|
+
/**
 * Collect the name slots for a record.
 *
 * The first slot is always `{ key: "default", value: rec.name }`. Each `[rawKey, value]` entry
 * of `rec.nameVariants` then adds a slot keyed by `normalizeNameKey(rawKey)`, unless its value
 * has already been emitted (either as the default or by an earlier variant).
 */
export function nameSlotsFor(rec) {
    const canonical = rec.name;
    const emittedValues = new Set([canonical]);
    const result = [{ key: "default", value: canonical }];
    for (const entry of rec.nameVariants) {
        const [variantKey, surface] = entry;
        if (!emittedValues.has(surface)) {
            emittedValues.add(surface);
            result.push({ key: normalizeNameKey(variantKey), value: surface });
        }
    }
    return result;
}
|
|
117
|
+
/** Adapter id; also used as the `source` value and the `source_id` prefix of emitted rows. */
export const WOF_POSTALCODE_ADAPTER_ID = "wof-postalcode";
/**
 * Create the Who's On First postalcode adapter.
 *
 * The returned object exposes `id`, `defaultLicense`, `description`, and an async generator
 * `rows(opts)` that reads `opts.inputPath`, `opts.signal`, `opts.country` and `opts.limit`.
 */
export function createWofPostalcodeAdapter() {
    return {
        id: WOF_POSTALCODE_ADAPTER_ID,
        defaultLicense: "CC0-1.0",
        description: "Who's On First postalcode GeoJSON bundles (postcode → locality/region pairs). Ancestor names from sibling admin repos.",
        async *rows(opts) {
            // First pass: index every record whose placetype maps to a component tag. Admin
            // records are kept alongside postcodes because the ancestry index is built from
            // this same map.
            const records = new Map();
            const walker = walkFeatures(opts.inputPath, { signal: opts.signal });
            for await (const feature of walker) {
                if (opts.signal?.aborted) {
                    return;
                }
                const countryMatches = !opts.country || feature.country === opts.country;
                if (countryMatches && placetypeToTag(feature.placetype)) {
                    records.set(feature.id, feature);
                }
            }
            const lineageById = buildAncestryIndex(records);
            // Second pass: visit ids in ascending order so output order is deterministic, and
            // emit rows for postcode records only.
            const orderedIds = Array.from(records.keys());
            orderedIds.sort((x, y) => x - y);
            let count = 0;
            for (const wofId of orderedIds) {
                if (opts.signal?.aborted) {
                    return;
                }
                const record = records.get(wofId);
                if (placetypeToTag(record.placetype) !== "postcode") {
                    continue;
                }
                const lineage = lineageById.get(wofId) ?? [];
                for (const { key: slotKey, value: surface } of nameSlotsFor(record)) {
                    for (const { suffix, components } of postcodeVariantsFor(record, lineage, surface)) {
                        // The limit counts rows actually yielded, not candidates considered.
                        if (opts.limit !== undefined && count >= opts.limit) {
                            return;
                        }
                        const raw = formatAddress(components, record.country, { separator: ", " });
                        if (!raw) {
                            continue;
                        }
                        const aligned = reconcileComponents(components, raw);
                        if (Object.keys(aligned).length === 0) {
                            continue;
                        }
                        yield {
                            raw,
                            components: aligned,
                            country: record.country,
                            locale: LOCALE_BY_COUNTRY[record.country],
                            source: WOF_POSTALCODE_ADAPTER_ID,
                            source_id: `${WOF_POSTALCODE_ADAPTER_ID}-${record.id}-${slotKey}-${suffix}`,
                            corpus_version: "",
                            license: "CC0-1.0",
                        };
                        count++;
                    }
                }
            }
        },
    };
}
/** Shared adapter instance created once at module load. */
export const wofPostalcodeAdapter = createWofPostalcodeAdapter();
|
|
178
|
+
//# sourceMappingURL=adapter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../../../src/adapters/wof-postalcode-json/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AAIH,OAAO,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAA;AAEpE,OAAO,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,YAAY,EAAkB,MAAM,mBAAmB,CAAA;AAEtG,MAAM,oBAAoB,GAA2B;IACpD,EAAE,EAAE,0BAA0B;IAC9B,EAAE,EAAE,QAAQ;CACZ,CAAA;AAED,MAAM,iBAAiB,GAA2B;IACjD,EAAE,EAAE,OAAO;IACX,EAAE,EAAE,OAAO;CACX,CAAA;AAED,SAAS,cAAc,CAAC,SAAwC;IAC/D,QAAQ,SAAS,EAAE,CAAC;QACnB,KAAK,SAAS,CAAC;QACf,KAAK,QAAQ;YACZ,OAAO,SAAS,CAAA;QACjB,KAAK,aAAa,CAAC;QACnB,KAAK,QAAQ;YACZ,OAAO,QAAQ,CAAA;QAChB,KAAK,UAAU;YACd,OAAO,UAAU,CAAA;QAClB,KAAK,YAAY;YAChB,OAAO,UAAU,CAAA;QAClB;YACC,OAAO,SAAS,CAAA;IAClB,CAAC;AACF,CAAC;AAOD;;;GAGG;AACH,MAAM,UAAU,mBAAmB,CAAC,GAAc,EAAE,QAAqB,EAAE,QAAgB;IAC1F,IAAI,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,UAAU;QAAE,OAAO,EAAE,CAAA;IAE3D,MAAM,QAAQ,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,UAAU,CAAC,CAAA;IACjF,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,QAAQ,CAAC,CAAA;IAC7E,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,SAAS,CAAC,CAAA;IAC/E,MAAM,cAAc,GAAG,oBAAoB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,OAAO,EAAE,IAAI,IAAI,GAAG,CAAC,OAAO,CAAA;IAExF,MAAM,QAAQ,GAAkB,CAAC,EAAE,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,EAAE,QAAQ,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAA;IAExF,IAAI,QAAQ,EAAE,CAAC;QACd,QAAQ,CAAC,IAAI,CAAC;YACb,MAAM,EAAE,eAAe;YACvB,UAAU,EAAE,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,CAAC,IAAI,EAAE;SAC3D,CAAC,CAAA;IACH,CAAC;IACD,IAAI,QAAQ,IAAI,MAAM,EAAE,CAAC;QACxB,QAAQ,CAAC,IAAI,CAAC;YACb,MAAM,EAAE,sBAAsB;YAC9B,UAAU,EAAE,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE;SAChF,CAAC,CAAA;IACH,CAAC;IACD,IAAI,QAAQ,IAAI,MAAM,IAAI,OAAO,EAAE,CAAC;QACnC,QAAQ,CAAC,IAAI,CAAC;YACb,MAAM,EAAE,8BAA8B;YACtC,UAAU,EAAE;gBACX,QAAQ,EAAE,QAAQ;gBAClB,QAAQ,EAAE,QAAQ,CAAC,IAAI;gBACvB,MAAM,EAAE,MAAM,CAAC,
IAAI;gBACnB,OAAO,EAAE,cAAc;aACvB;SACD,CAAC,CAAA;IACH,CAAC;IAED,OAAO,QAAQ,CAAA;AAChB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,YAAY,CAAC,GAAc;IAC1C,MAAM,IAAI,GAAG,IAAI,GAAG,CAAS,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAA;IACxC,MAAM,KAAK,GAA0C,CAAC,EAAE,GAAG,EAAE,SAAS,EAAE,KAAK,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC,CAAA;IAC1F,KAAK,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,GAAG,CAAC,YAAY,EAAE,CAAC;QAChD,IAAI,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC;YAAE,SAAQ;QAC7B,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,CAAA;QACf,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,gBAAgB,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,CAAC,CAAA;IACrD,CAAC;IACD,OAAO,KAAK,CAAA;AACb,CAAC;AAED,MAAM,CAAC,MAAM,yBAAyB,GAAG,gBAAgB,CAAA;AAEzD,MAAM,UAAU,0BAA0B;IACzC,OAAO;QACN,EAAE,EAAE,yBAAyB;QAC7B,cAAc,EAAE,SAAS;QACzB,WAAW,EACV,wHAAwH;QAEzH,KAAK,CAAC,CAAC,IAAI,CAAC,IAAoB;YAC/B,uFAAuF;YACvF,sFAAsF;YACtF,sFAAsF;YACtF,MAAM,IAAI,GAAG,IAAI,GAAG,EAAqB,CAAA;YACzC,IAAI,KAAK,EAAE,MAAM,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,CAAC;gBAC/E,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;oBAAE,OAAM;gBAChC,IAAI,IAAI,CAAC,OAAO,IAAI,GAAG,CAAC,OAAO,KAAK,IAAI,CAAC,OAAO;oBAAE,SAAQ;gBAC1D,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC;oBAAE,SAAQ;gBAC5C,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAA;YACtB,CAAC;YAED,MAAM,QAAQ,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;YAEzC,iEAAiE;YACjE,MAAM,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;YAClD,IAAI,OAAO,GAAG,CAAC,CAAA;YAEf,KAAK,MAAM,EAAE,IAAI,GAAG,EAAE,CAAC;gBACtB,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;oBAAE,OAAM;gBAChC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAE,CAAA;gBACzB,IAAI,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,UAAU;oBAAE,SAAQ;gBAE1D,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,EAAE,CAAA;gBACpC,MAAM,KAAK,GAAG,YAAY,CAAC,GAAG,CAAC,CAAA;gBAE/B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;oBAC1B,MAAM,QAAQ,GAAG,mBAAmB,CAAC,GAAG,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAA;oBAC5D,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;wBAChC,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,OAAO,IAAI,IAAI,CAAC,KAAK;4BAAE,
OAAM;wBAE7D,MAAM,GAAG,GAAG,aAAa,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;wBAC/E,IAAI,CAAC,GAAG;4BAAE,SAAQ;wBAClB,MAAM,OAAO,GAAG,mBAAmB,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;wBAC5D,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,KAAK,CAAC;4BAAE,SAAQ;wBAE/C,MAAM;4BACL,GAAG;4BACH,UAAU,EAAE,OAAO;4BACnB,OAAO,EAAE,GAAG,CAAC,OAAO;4BACpB,MAAM,EAAE,iBAAiB,CAAC,GAAG,CAAC,OAAO,CAAC;4BACtC,MAAM,EAAE,yBAAyB;4BACjC,SAAS,EAAE,GAAG,yBAAyB,IAAI,GAAG,CAAC,EAAE,IAAI,IAAI,CAAC,GAAG,IAAI,OAAO,CAAC,MAAM,EAAE;4BACjF,cAAc,EAAE,EAAE;4BAClB,OAAO,EAAE,SAAS;yBAClB,CAAA;wBACD,OAAO,EAAE,CAAA;oBACV,CAAC;gBACF,CAAC;YACF,CAAC;QACF,CAAC;KACD,CAAA;AACF,CAAC;AAED,MAAM,CAAC,MAAM,oBAAoB,GAAG,0BAA0B,EAAE,CAAA"}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Alignment: turn a `CanonicalRow` (raw + components) into a `LabeledRow` (raw + tokens + BIO
|
|
7
|
+
* labels) or a `QuarantinedRow` (raw + reason) per the Phase 1 plan.
|
|
8
|
+
*
|
|
9
|
+
* Pipeline:
|
|
10
|
+
*
|
|
11
|
+
* 1. For each `(tag, value)` in `components`, find the value's character span in `raw`. First try a
|
|
12
|
+
* verbatim substring match (case-insensitive, whitespace-collapsed). If that fails, fall
|
|
13
|
+
* back to fuzzy match via `fastest-levenshtein`, with a tunable edit distance threshold.
|
|
14
|
+
* 2. If any component cannot be located, reject the row with a human-readable reason and send it to
|
|
15
|
+
* the quarantine pile (`reason: "component-not-found:<tag>"` or
|
|
16
|
+
* `"edit-distance-exceeded:<tag>:<dist>"`).
|
|
17
|
+
* 3. Tokenize `raw` with the supplied `Tokenizer` (defaults to the whitespace tokenizer).
|
|
18
|
+
* 4. For each token: walk the list of component spans, pick the one whose span contains the token's
|
|
19
|
+
* character range. First token in a component span → `B-<tag>`; subsequent tokens →
|
|
20
|
+
* `I-<tag>`; no overlap → `O`.
|
|
21
|
+
*
|
|
22
|
+
* Two structural invariants the function preserves:
|
|
23
|
+
*
|
|
24
|
+
* - `tokens.length === labels.length` always.
|
|
25
|
+
* - Each component contributes at most one contiguous BIO run (no `B-tag … O … I-tag` gaps). This is
|
|
26
|
+
* enforced by greedy first-match span assignment + ordered token iteration.
|
|
27
|
+
*/
|
|
28
|
+
import { type Tokenizer } from "./tokenize.js";
|
|
29
|
+
import type { CanonicalRow, LabeledRow, QuarantinedRow } from "./types.js";
|
|
30
|
+
/** Tuning knobs accepted by `alignRow`; every field is optional. */
export interface AlignOptions {
    /** Tokenizer applied to `raw`. When omitted, `whitespaceTokenizer()` is used. */
    tokenizer?: Tokenizer;
    /**
     * Largest Levenshtein edit distance tolerated once a verbatim substring match has failed.
     * `0` means only verbatim matches are accepted. Default `2`.
     *
     * The distance is measured against windows of `raw` that have the same length as the
     * component value, so the threshold scales naturally with that length.
     */
    maxEditDistance?: number;
    /**
     * Whether the substring search ignores case. Default `true`. Only the comparison is
     * case-insensitive; the span kept from `raw` retains its original casing.
     */
    caseInsensitive?: boolean;
}
|
|
48
|
+
/**
 * Outcome of aligning one row, discriminated on `kind`: `"labeled"` carries a `LabeledRow`,
 * `"quarantined"` carries a `QuarantinedRow`.
 */
export type AlignmentResult =
    | { kind: "labeled"; row: LabeledRow }
    | { kind: "quarantined"; row: QuarantinedRow };
|
|
56
|
+
/**
 * Align a single canonical row.
 *
 * @param row Row to align.
 * @param opts Optional alignment settings; see `AlignOptions`.
 * @returns A `"labeled"` result on success, otherwise a `"quarantined"` result.
 */
export declare function alignRow(row: CanonicalRow, opts?: AlignOptions): AlignmentResult;
|
|
58
|
+
//# sourceMappingURL=align.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"align.d.ts","sourceRoot":"","sources":["../../src/align.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAIH,OAAO,EAAuC,KAAK,SAAS,EAAE,MAAM,eAAe,CAAA;AACnF,OAAO,KAAK,EAAE,YAAY,EAAE,UAAU,EAAE,cAAc,EAAE,MAAM,YAAY,CAAA;AAE1E,8BAA8B;AAC9B,MAAM,WAAW,YAAY;IAC5B,6DAA6D;IAC7D,SAAS,CAAC,EAAE,SAAS,CAAA;IAErB;;;;;;OAMG;IACH,eAAe,CAAC,EAAE,MAAM,CAAA;IAExB;;;OAGG;IACH,eAAe,CAAC,EAAE,OAAO,CAAA;CACzB;AAED,4DAA4D;AAC5D,MAAM,MAAM,eAAe,GAAG;IAAE,IAAI,EAAE,SAAS,CAAC;IAAC,GAAG,EAAE,UAAU,CAAA;CAAE,GAAG;IAAE,IAAI,EAAE,aAAa,CAAC;IAAC,GAAG,EAAE,cAAc,CAAA;CAAE,CAAA;AAQjH,0BAA0B;AAC1B,wBAAgB,QAAQ,CAAC,GAAG,EAAE,YAAY,EAAE,IAAI,GAAE,YAAiB,GAAG,eAAe,CAyCpF"}
|