@mailwoman/resolver-wof-sqlite 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +250 -0
- package/out/address-point-interpolation.d.ts +48 -0
- package/out/address-point-interpolation.d.ts.map +1 -0
- package/out/address-point-interpolation.js +164 -0
- package/out/address-point-interpolation.js.map +1 -0
- package/out/address-point-schema.d.ts +58 -0
- package/out/address-point-schema.d.ts.map +1 -0
- package/out/address-point-schema.js +67 -0
- package/out/address-point-schema.js.map +1 -0
- package/out/address-point.d.ts +29 -0
- package/out/address-point.d.ts.map +1 -0
- package/out/address-point.js +62 -0
- package/out/address-point.js.map +1 -0
- package/out/ancestry.d.ts +40 -0
- package/out/ancestry.d.ts.map +1 -0
- package/out/ancestry.js +53 -0
- package/out/ancestry.js.map +1 -0
- package/out/build-candidate-cli.d.ts +16 -0
- package/out/build-candidate-cli.d.ts.map +1 -0
- package/out/build-candidate-cli.js +80 -0
- package/out/build-candidate-cli.js.map +1 -0
- package/out/build-candidate.d.ts +54 -0
- package/out/build-candidate.d.ts.map +1 -0
- package/out/build-candidate.js +230 -0
- package/out/build-candidate.js.map +1 -0
- package/out/build-coincident-roles-cli.d.ts +16 -0
- package/out/build-coincident-roles-cli.d.ts.map +1 -0
- package/out/build-coincident-roles-cli.js +94 -0
- package/out/build-coincident-roles-cli.js.map +1 -0
- package/out/build-fts-cli.d.ts +23 -0
- package/out/build-fts-cli.d.ts.map +1 -0
- package/out/build-fts-cli.js +117 -0
- package/out/build-fts-cli.js.map +1 -0
- package/out/build-slim-cli.d.ts +14 -0
- package/out/build-slim-cli.d.ts.map +1 -0
- package/out/build-slim-cli.js +130 -0
- package/out/build-slim-cli.js.map +1 -0
- package/out/build-slim.d.ts +71 -0
- package/out/build-slim.d.ts.map +1 -0
- package/out/build-slim.js +267 -0
- package/out/build-slim.js.map +1 -0
- package/out/candidate-lookup.d.ts +43 -0
- package/out/candidate-lookup.d.ts.map +1 -0
- package/out/candidate-lookup.js +191 -0
- package/out/candidate-lookup.js.map +1 -0
- package/out/candidate-schema.d.ts +86 -0
- package/out/candidate-schema.d.ts.map +1 -0
- package/out/candidate-schema.js +109 -0
- package/out/candidate-schema.js.map +1 -0
- package/out/coincident-roles.d.ts +86 -0
- package/out/coincident-roles.d.ts.map +1 -0
- package/out/coincident-roles.js +160 -0
- package/out/coincident-roles.js.map +1 -0
- package/out/convention.d.ts +109 -0
- package/out/convention.d.ts.map +1 -0
- package/out/convention.js +94 -0
- package/out/convention.js.map +1 -0
- package/out/fst-autocomplete.d.ts +49 -0
- package/out/fst-autocomplete.d.ts.map +1 -0
- package/out/fst-autocomplete.js +124 -0
- package/out/fst-autocomplete.js.map +1 -0
- package/out/fst-builder.d.ts +20 -0
- package/out/fst-builder.d.ts.map +1 -0
- package/out/fst-builder.js +219 -0
- package/out/fst-builder.js.map +1 -0
- package/out/fst-deserialize-web.d.ts +16 -0
- package/out/fst-deserialize-web.d.ts.map +1 -0
- package/out/fst-deserialize-web.js +133 -0
- package/out/fst-deserialize-web.js.map +1 -0
- package/out/fst-matcher.d.ts +33 -0
- package/out/fst-matcher.d.ts.map +1 -0
- package/out/fst-matcher.js +117 -0
- package/out/fst-matcher.js.map +1 -0
- package/out/fst-serialize.d.ts +30 -0
- package/out/fst-serialize.d.ts.map +1 -0
- package/out/fst-serialize.js +261 -0
- package/out/fst-serialize.js.map +1 -0
- package/out/fst-types.d.ts +60 -0
- package/out/fst-types.d.ts.map +1 -0
- package/out/fst-types.js +11 -0
- package/out/fst-types.js.map +1 -0
- package/out/fts.d.ts +158 -0
- package/out/fts.d.ts.map +1 -0
- package/out/fts.js +261 -0
- package/out/fts.js.map +1 -0
- package/out/geo.d.ts +74 -0
- package/out/geo.d.ts.map +1 -0
- package/out/geo.js +88 -0
- package/out/geo.js.map +1 -0
- package/out/index.d.ts +27 -0
- package/out/index.d.ts.map +1 -0
- package/out/index.js +22 -0
- package/out/index.js.map +1 -0
- package/out/interpolation.d.ts +84 -0
- package/out/interpolation.d.ts.map +1 -0
- package/out/interpolation.js +150 -0
- package/out/interpolation.js.map +1 -0
- package/out/lookup.d.ts +156 -0
- package/out/lookup.d.ts.map +1 -0
- package/out/lookup.js +876 -0
- package/out/lookup.js.map +1 -0
- package/out/postal-city-alias-lookup.d.ts +50 -0
- package/out/postal-city-alias-lookup.d.ts.map +1 -0
- package/out/postal-city-alias-lookup.js +66 -0
- package/out/postal-city-alias-lookup.js.map +1 -0
- package/out/postal-city-alias-schema.d.ts +51 -0
- package/out/postal-city-alias-schema.d.ts.map +1 -0
- package/out/postal-city-alias-schema.js +47 -0
- package/out/postal-city-alias-schema.js.map +1 -0
- package/out/postal-city-candidate-schema.d.ts +58 -0
- package/out/postal-city-candidate-schema.d.ts.map +1 -0
- package/out/postal-city-candidate-schema.js +56 -0
- package/out/postal-city-candidate-schema.js.map +1 -0
- package/out/postcode-point-lookup.d.ts +38 -0
- package/out/postcode-point-lookup.d.ts.map +1 -0
- package/out/postcode-point-lookup.js +46 -0
- package/out/postcode-point-lookup.js.map +1 -0
- package/out/reverse.d.ts +99 -0
- package/out/reverse.d.ts.map +1 -0
- package/out/reverse.js +290 -0
- package/out/reverse.js.map +1 -0
- package/out/schema.d.ts +163 -0
- package/out/schema.d.ts.map +1 -0
- package/out/schema.js +18 -0
- package/out/schema.js.map +1 -0
- package/out/sharding.d.ts +96 -0
- package/out/sharding.d.ts.map +1 -0
- package/out/sharding.js +129 -0
- package/out/sharding.js.map +1 -0
- package/out/sqlite-convention-source.d.ts +29 -0
- package/out/sqlite-convention-source.d.ts.map +1 -0
- package/out/sqlite-convention-source.js +53 -0
- package/out/sqlite-convention-source.js.map +1 -0
- package/out/sqlite-utils.d.ts +17 -0
- package/out/sqlite-utils.d.ts.map +1 -0
- package/out/sqlite-utils.js +24 -0
- package/out/sqlite-utils.js.map +1 -0
- package/out/street-morphology-fst-builder.d.ts +59 -0
- package/out/street-morphology-fst-builder.d.ts.map +1 -0
- package/out/street-morphology-fst-builder.js +174 -0
- package/out/street-morphology-fst-builder.js.map +1 -0
- package/out/street-normalize.d.ts +66 -0
- package/out/street-normalize.d.ts.map +1 -0
- package/out/street-normalize.js +176 -0
- package/out/street-normalize.js.map +1 -0
- package/out/street-segment-schema.d.ts +61 -0
- package/out/street-segment-schema.d.ts.map +1 -0
- package/out/street-segment-schema.js +64 -0
- package/out/street-segment-schema.js.map +1 -0
- package/out/types.d.ts +137 -0
- package/out/types.d.ts.map +1 -0
- package/out/types.js +13 -0
- package/out/types.js.map +1 -0
- package/out/unified-schema.d.ts +25 -0
- package/out/unified-schema.d.ts.map +1 -0
- package/out/unified-schema.js +142 -0
- package/out/unified-schema.js.map +1 -0
- package/package.json +54 -0
package/out/lookup.js
ADDED
|
@@ -0,0 +1,876 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `WofSqlitePlaceLookup` — the resolver implementation backed by `node:sqlite` + a Kysely-typed
|
|
7
|
+
* query layer where the queries are non-trivial, and raw SQL where they aren't (FTS5 MATCH, the
|
|
8
|
+
* FTS index build).
|
|
9
|
+
*
|
|
10
|
+
* See `docs/plan/phases/PHASE_4_2_wof_sqlite.md` for the design rationale.
|
|
11
|
+
*/
|
|
12
|
+
import { Kysely, sql } from "kysely";
|
|
13
|
+
import { DatabaseSync } from "node:sqlite";
|
|
14
|
+
import { SqliteDialect } from "@mailwoman/core/kysley/dialect";
|
|
15
|
+
import { expandPlacetypeFilter } from "@mailwoman/resolver";
|
|
16
|
+
import { ADDRESS_CONVENTION_TABLE, resolveConvention, SeedConventionSource, } from "./convention.js";
|
|
17
|
+
import { ancestorLineage } from "./ancestry.js";
|
|
18
|
+
import { COINCIDENT_ROLES_TABLE, coincidentRolesExists } from "./coincident-roles.js";
|
|
19
|
+
import { aliasBagExactMatch, buildPlaceSearchFts, PLACE_BBOX_TABLE, PLACE_POPULATION_TABLE, placeBboxExists, placePopulationExists, placeSearchFtsExists, } from "./fts.js";
|
|
20
|
+
import { bboxAround, haversineKm } from "./geo.js";
|
|
21
|
+
import { pickShardForPlacetype, resolveShards } from "./sharding.js";
|
|
22
|
+
import { SqliteConventionSource } from "./sqlite-convention-source.js";
|
|
23
|
+
/**
 * Default ranking weights. The constructor spreads these under any caller-supplied override
 * (`{ ...DEFAULT_WEIGHTS, ...weights }`), so every key here is the value used when the caller does
 * not set it.
 *
 * Frozen: this object is shared by every resolver instance in the process, so an accidental write
 * would silently retune all of them. Spreading a frozen object is unaffected.
 */
const DEFAULT_WEIGHTS = Object.freeze({
    placetypeMatchBoost: 0.5,
    localityImplicitBoost: 0.2,
    countryMatchBoost: 0.3,
    directChildBoost: 0.5,
    descendantBoost: 0.2,
    lengthPenaltyWeight: 0.1,
    proximityBoost: 0.8,
    proximityScaleKm: 100,
    // populationBoost is deliberately large. Empirical tuning against real WOF showed BM25 gaps of
    // 1.5-3.0 between famous places and tiny same-name peers, because the famous ones carry
    // hundreds of alt-name entries that hurt their FTS document score. For unambiguous queries
    // such as "New York" or "Chicago" to surface "the famous one" consistently, the population
    // signal has to dominate. Callers who want a more conservative balance can lower it through
    // the RankingWeights override.
    //
    // Note: this resolver reads `place_population` directly. The separate, Wikipedia-derived
    // `place_importance` table is consumed by the FST layer, not here. See
    // docs/articles/concepts/importance-vs-population.md for the two-signal contract.
    populationBoost: 4.0,
    populationScaleLog10: 6,
    // An exact name/alias match outranks a partial match before the weighted sum (population
    // included) is consulted, which keeps population an intra-tier prominence tiebreaker rather
    // than a cross-tier promoter. Fixes the 2-letter-region-abbrev bug ("ME" → Maine, not the
    // more-populous Missouri).
    exactMatchTiering: true,
});
|
|
49
|
+
/**
 * Minimum FTS over-fetch window for SHORT queries (3 characters or fewer after trimming) — region
 * abbreviations such as "NY" or "VT". The row holding the exact abbreviation scores poorly under
 * BM25 (its document is a long multilingual alt-name list), so the ordinary `limit * 4` window can
 * drop it before `exactMatchTiering` gets a chance to promote it. 200 comfortably covers every
 * same-abbreviation region across the 12-country gazetteer (a 2-letter token matches a few dozen
 * regions at most) while remaining a cheap region-placetype fetch. See the over-fetch comment in
 * `#fuzzyNameMatch`.
 */
const SHORT_QUERY_OVERFETCH = 200;
/**
 * Name of the coordinate-first candidate table (built by scripts/build-postcode-locality.py):
 * it maps a postcode to its containing and nearby localities, with WOF alt-name aliases.
 */
const POSTCODE_LOCALITY_TABLE = "postcode_locality";
/**
 * Distance-decay tunable for the coordinate-first locality soft-score
 * `Score = pc·S_pc + name·S_name + pop·S_pop` (each S in [0,1]): it sets how fast S_pc falls off
 * with distance. The pc/name/pop WEIGHTS themselves are not constants here — they come from the
 * resolved convention's `scoringWeights` (`WORLD_DEFAULT` = 0.6/0.3/0.1, the EU values), so a
 * locale can retune them as data.
 */
const CF_PC_DECAY_KM = 8;
/**
 * Maximum distance between the chosen locality and the postcode's containing locality. Beyond it
 * the postcode and the parsed city name are judged to disagree (a transposed / wrong-for-the-city
 * postcode) and the `mismatch` flag fires. Generous enough that a city-state Ortsteil (~15km from
 * the city centroid) and an abutting town (a few km away) are NOT flagged, yet tight enough to
 * catch a wrong city (hundreds of km away).
 */
const CF_MISMATCH_KM = 50;
// NOTE(review): undocumented, and its use is outside the visible part of this file. Presumably a
// score-gap threshold that takes part in the `mismatch` decision — confirm against the
// coordinate-first scorer.
const CF_MISMATCH_DELTA = 0.5;
|
|
80
|
+
/**
 * Case-fold + strip diacritics + collapse punctuation — for the coord-first soft name match.
 *
 * Steps: lowercase, decompose to NFD, drop the combining diacritical marks U+0300–U+036F, then
 * collapse every run of characters that are not letters, marks or numbers into one space and trim.
 *
 * Letters and digits of ANY script are kept. Restricting the kept set to `a-z0-9` erased every
 * name written in a non-Latin script (Greek, Cyrillic, …) down to the empty string and punched
 * holes into Latin letters that NFD cannot decompose ("ß", "ø", "ł"), so such names could never
 * match. For ASCII and decomposable-Latin input the result is the same as with the ASCII-only
 * class.
 *
 * @param {string} s Raw name, alias or query text.
 * @returns {string} The normalized comparison key; `""` when nothing but punctuation/space remains.
 */
function cfNormalize(s) {
    return s
        .toLowerCase()
        .normalize("NFD")
        .replace(/[\u0300-\u036f]/g, "") // combining diacritical marks
        // Remaining marks (\p{M}) are kept: in several scripts they are part of the spelling.
        .replace(/[^\p{L}\p{M}\p{N}]+/gu, " ")
        .trim();
}
|
|
89
|
+
/**
 * Builds the set of character trigrams of `s` after padding it with one space on each side, so
 * that short tokens still produce at least one trigram. An empty `s` yields an empty set.
 *
 * @param {string} s Text to split (callers pass already-normalized text).
 * @returns {Set<string>} Distinct 3-character windows, in first-occurrence order.
 */
function trigrams(s) {
    const padded = ` ${s} `;
    // A string of length L has L - 2 windows of width 3; `padded` is never shorter than 2.
    const windowCount = Math.max(0, padded.length - 2);
    const windows = Array.from({ length: windowCount }, (_, start) => padded.slice(start, start + 3));
    return new Set(windows);
}
|
|
97
|
+
/**
 * Jaccard similarity of the padded character-trigram sets of `a` and `b`, a value in [0,1].
 * Tolerates swallowed-leading-char fragments ("auen" vs "plauen") and small misspellings without
 * a heavyweight edit-distance pass.
 *
 * @param {string} a First text.
 * @param {string} b Second text.
 * @returns {number} |A ∩ B| / |A ∪ B|, or 0 when either trigram set is empty.
 */
function trigramJaccard(a, b) {
    const left = trigrams(a);
    const right = trigrams(b);
    if (left.size === 0 || right.size === 0) {
        return 0;
    }
    let shared = 0;
    left.forEach((gram) => {
        if (right.has(gram)) {
            shared += 1;
        }
    });
    // |A ∪ B| by inclusion–exclusion.
    const unionSize = left.size + right.size - shared;
    return shared / unionSize;
}
|
|
112
|
+
/**
 * Soft name-match score in [0,1] between a query text and a place's name plus aliases.
 * An exact match after normalization scores 1 immediately; otherwise the score is the best
 * trigram-Jaccard similarity over all labels. Labels (or a query) that normalize to the empty
 * string contribute nothing.
 *
 * @param {string} text Query text.
 * @param {string} name Primary place name.
 * @param {Iterable<string>} aliases Alternative names for the same place.
 * @returns {number} 1 for an exact normalized match, else the best similarity (0 if none).
 */
function softNameScore(text, name, aliases) {
    const wanted = cfNormalize(text);
    if (!wanted) {
        return 0;
    }
    let top = 0;
    const labels = [name, ...aliases];
    for (const label of labels) {
        const folded = cfNormalize(label);
        if (folded) {
            if (folded === wanted) {
                return 1;
            }
            top = Math.max(top, trigramJaccard(wanted, folded));
        }
    }
    return top;
}
|
|
128
|
+
export class WofSqlitePlaceLookup {
|
|
129
|
+
#db;
|
|
130
|
+
#ownsDb;
|
|
131
|
+
#kysely;
|
|
132
|
+
#weights;
|
|
133
|
+
/**
|
|
134
|
+
* Cached at construction so we don't `sqlite_master` query on every findPlace call. Bbox + near-
|
|
135
|
+
* with-radius queries fall back to no-filter when this is false, preserving compatibility with
|
|
136
|
+
* DBs that were FTS-built before the R*Tree shipped.
|
|
137
|
+
*
|
|
138
|
+
* Per-shard: a shard is only considered to have the bbox index if its own R*Tree table exists.
|
|
139
|
+
*/
|
|
140
|
+
#hasBboxIndex;
|
|
141
|
+
/**
|
|
142
|
+
* Per-shard probe for the `place_population` aux table. When false, the LEFT JOIN is omitted from
|
|
143
|
+
* the SELECT and population boost is 0 for every row — preserves compatibility with DBs built
|
|
144
|
+
* before this feature shipped.
|
|
145
|
+
*/
|
|
146
|
+
#hasPopulationIndex;
|
|
147
|
+
/**
|
|
148
|
+
* Per-shard probe for the `postcode_locality` table (the coordinate-first candidate table, built
|
|
149
|
+
* by scripts/build-postcode-locality.py). Cached at construction; null'd out when absent so the
|
|
150
|
+
* coord-first path silently no-ops on a deployment that didn't ship the table.
|
|
151
|
+
*/
|
|
152
|
+
#postcodeLocalityShard;
|
|
153
|
+
/**
|
|
154
|
+
* Resolved shard list. Always at least one entry; first is `main`. Multi-shard adds extras with
|
|
155
|
+
* their own derived (or override) schema names.
|
|
156
|
+
*/
|
|
157
|
+
#shards;
|
|
158
|
+
/**
|
|
159
|
+
* The Geographic Rule Engine (Direction E, #289). `#conventionSource` supplies per-WOF-polygon
|
|
160
|
+
* resolution profiles; `#strategies` is the named-primitive registry the merged convention
|
|
161
|
+
* dispatches. Empty source → every query resolves to `WORLD_DEFAULT` → byte-identical to the
|
|
162
|
+
* pre-engine coordinate-first path. `#countryWofIdCache` memoizes the country-code →
|
|
163
|
+
* country-WOF-id lookup that seeds the convention ancestor chain (one query per country, then
|
|
164
|
+
* cached).
|
|
165
|
+
*/
|
|
166
|
+
#conventionSource;
|
|
167
|
+
#strategies;
|
|
168
|
+
#countryWofIdCache = new Map();
|
|
169
|
+
/** Strategy names already warned about — so an unknown name surfaces once, not once per query. */
|
|
170
|
+
#warnedUnknownStrategies = new Set();
|
|
171
|
+
/**
|
|
172
|
+
* Lazily-built `admin_id → coincident localities` map from the #403 relation (null until first
|
|
173
|
+
* use).
|
|
174
|
+
*/
|
|
175
|
+
#coincidentRolesCache = null;
|
|
176
|
+
/** Per-id memoized ancestor lineages (#404) — a hot chain is queried once. */
|
|
177
|
+
#ancestorsCache = new Map();
|
|
178
|
+
/**
|
|
179
|
+
* Opt-in postal-city alias reader (#475). `null` unless `opts.postalCityAliases` was supplied —
|
|
180
|
+
* every alias code path is gated on this, so the default resolver is byte-identical.
|
|
181
|
+
*/
|
|
182
|
+
#postalCityAliases;
|
|
183
|
+
constructor(opts, weights) {
|
|
184
|
+
if (opts.database && opts.databasePath) {
|
|
185
|
+
throw new Error("WofSqlitePlaceLookup: pass either `database` or `databasePath`, not both");
|
|
186
|
+
}
|
|
187
|
+
if (!opts.database && !opts.databasePath) {
|
|
188
|
+
throw new Error("WofSqlitePlaceLookup: one of `database` or `databasePath` is required");
|
|
189
|
+
}
|
|
190
|
+
if (opts.database) {
|
|
191
|
+
this.#db = opts.database;
|
|
192
|
+
this.#ownsDb = false;
|
|
193
|
+
this.#shards = [{ path: ":memory:", schemaName: "main", placetypes: [] }];
|
|
194
|
+
}
|
|
195
|
+
else {
|
|
196
|
+
const shards = resolveShards(opts.databasePath);
|
|
197
|
+
this.#shards = shards;
|
|
198
|
+
this.#db = new DatabaseSync(shards[0].path, { readOnly: false });
|
|
199
|
+
this.#ownsDb = true;
|
|
200
|
+
// ATTACH each non-main shard. Schema names were validated by resolveShards, so safe to
|
|
201
|
+
// interpolate directly (SQLite ATTACH doesn't accept parameters for the schema name).
|
|
202
|
+
for (const s of shards.slice(1)) {
|
|
203
|
+
this.#db.exec(`ATTACH DATABASE '${s.path.replace(/'/g, "''")}' AS ${s.schemaName}`);
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
// node:sqlite has no .pragma() helper; pragmas are executed as plain SQL.
|
|
207
|
+
this.#db.exec("PRAGMA busy_timeout = 5000");
|
|
208
|
+
if (opts.buildFts) {
|
|
209
|
+
this.#ensureFts();
|
|
210
|
+
}
|
|
211
|
+
else {
|
|
212
|
+
this.#assertFtsExists();
|
|
213
|
+
}
|
|
214
|
+
this.#kysely = new Kysely({
|
|
215
|
+
dialect: new SqliteDialect({ database: this.#db }),
|
|
216
|
+
});
|
|
217
|
+
this.#weights = { ...DEFAULT_WEIGHTS, ...(weights ?? {}) };
|
|
218
|
+
// Probe each shard's aux-table presence — driven by per-shard table existence in
|
|
219
|
+
// sqlite_master. Cached at construction so findPlace doesn't query sqlite_master per call.
|
|
220
|
+
this.#hasBboxIndex = new Map();
|
|
221
|
+
this.#hasPopulationIndex = new Map();
|
|
222
|
+
for (const s of this.#shards) {
|
|
223
|
+
this.#hasBboxIndex.set(s.schemaName, this.#shardHasTable(s.schemaName, PLACE_BBOX_TABLE));
|
|
224
|
+
this.#hasPopulationIndex.set(s.schemaName, this.#shardHasTable(s.schemaName, PLACE_POPULATION_TABLE));
|
|
225
|
+
}
|
|
226
|
+
// The postcode_locality table can live on any attached shard (typically its own
|
|
227
|
+
// `postcode-locality-<cc>.db`). Find the first shard that has it; null = coord-first disabled.
|
|
228
|
+
this.#postcodeLocalityShard =
|
|
229
|
+
this.#shards.find((s) => this.#shardHasTable(s.schemaName, POSTCODE_LOCALITY_TABLE))?.schemaName ?? null;
|
|
230
|
+
// Opt-in postal-city alias reader (#475). Construction-time present-or-not is the gate: null
|
|
231
|
+
// keeps the coordinate-first scorer byte-identical to pre-#475.
|
|
232
|
+
this.#postalCityAliases = opts.postalCityAliases ?? null;
|
|
233
|
+
// The Geographic Rule Engine convention source. Precedence: an explicit `opts.conventions`
|
|
234
|
+
// (a ready source or a seed map) wins; else the build-from-source convention asset if one is
|
|
235
|
+
// attached (auto-detected, like the postcode_locality shard — adding conventions.db to
|
|
236
|
+
// databasePath enables it; queried on demand, not paged into memory); else empty, so EU rides
|
|
237
|
+
// WORLD_DEFAULT. The registry binds strategy NAMES to the SQL-bound primitives — adding a
|
|
238
|
+
// strategy is registering it here.
|
|
239
|
+
const conventionShard = this.#shards.find((s) => this.#shardHasTable(s.schemaName, ADDRESS_CONVENTION_TABLE))?.schemaName ?? null;
|
|
240
|
+
this.#conventionSource = opts.conventions
|
|
241
|
+
? "get" in opts.conventions && typeof opts.conventions.get === "function"
|
|
242
|
+
? opts.conventions
|
|
243
|
+
: new SeedConventionSource(opts.conventions)
|
|
244
|
+
: conventionShard
|
|
245
|
+
? new SqliteConventionSource(this.#db, conventionShard)
|
|
246
|
+
: new SeedConventionSource();
|
|
247
|
+
this.#strategies = new Map([
|
|
248
|
+
["postcode_area_resolution", (q, c) => this.#postcodeAreaResolution(q, c)],
|
|
249
|
+
["fallback_fuzzy_name_match", (q) => this.#fuzzyNameMatch(q)],
|
|
250
|
+
]);
|
|
251
|
+
}
|
|
252
|
+
#shardHasTable(schemaName, tableName) {
|
|
253
|
+
// For main, the existing helpers work directly. For attached shards we have to ask via the
|
|
254
|
+
// schema-qualified `sqlite_master` view.
|
|
255
|
+
if (schemaName === "main") {
|
|
256
|
+
if (tableName === PLACE_BBOX_TABLE)
|
|
257
|
+
return placeBboxExists(this.#db);
|
|
258
|
+
if (tableName === PLACE_POPULATION_TABLE)
|
|
259
|
+
return placePopulationExists(this.#db);
|
|
260
|
+
}
|
|
261
|
+
const row = this.#db
|
|
262
|
+
.prepare(`SELECT name FROM ${schemaName}.sqlite_master WHERE type = 'table' AND name = ?`)
|
|
263
|
+
.get(tableName);
|
|
264
|
+
return Boolean(row);
|
|
265
|
+
}
|
|
266
|
+
async findPlace(query) {
|
|
267
|
+
// Geographic Rule Engine dispatch (#289). Resolve the effective convention for this query
|
|
268
|
+
// (WORLD_DEFAULT for the EU locales — the seed source is empty) and run its candidate strategies
|
|
269
|
+
// in order; the first to return a non-null result wins. The default list,
|
|
270
|
+
// [postcode_area_resolution, fallback_fuzzy_name_match], reproduces the pre-engine coordinate-
|
|
271
|
+
// first → FTS fall-through exactly. Unknown strategy names are skipped, so a convention may name
|
|
272
|
+
// a primitive a future phase will register.
|
|
273
|
+
const convention = this.#conventionFor(query);
|
|
274
|
+
for (const name of convention.candidateStrategies) {
|
|
275
|
+
const strategy = this.#strategies.get(name);
|
|
276
|
+
if (!strategy) {
|
|
277
|
+
this.#warnUnknownStrategy(name);
|
|
278
|
+
continue;
|
|
279
|
+
}
|
|
280
|
+
const result = await strategy(query, convention);
|
|
281
|
+
if (result !== null)
|
|
282
|
+
return result;
|
|
283
|
+
}
|
|
284
|
+
return [];
|
|
285
|
+
}
|
|
286
|
+
/**
|
|
287
|
+
* Dual-role localities coincident with an admin id, from the precomputed `coincident_roles`
|
|
288
|
+
* relation (#403). Backs {@link ResolveOpts.hierarchyCompletion} (#405): O(1) once the relation is
|
|
289
|
+
* loaded. Returns `[]` when the relation table is absent (older DB) or the admin isn't a
|
|
290
|
+
* dual-role place, so completion degrades gracefully. The relation + `spr` join is loaded once
|
|
291
|
+
* and memoized.
|
|
292
|
+
*/
|
|
293
|
+
coincidentLocalitiesFor(adminId) {
|
|
294
|
+
const id = typeof adminId === "number" ? adminId : Number(adminId);
|
|
295
|
+
if (!Number.isFinite(id))
|
|
296
|
+
return [];
|
|
297
|
+
if (!this.#coincidentRolesCache) {
|
|
298
|
+
const map = new Map();
|
|
299
|
+
if (coincidentRolesExists(this.#db)) {
|
|
300
|
+
const rows = this.#db
|
|
301
|
+
.prepare(`SELECT cr.admin_id AS adminId, s.id AS id, s.name AS name, s.country AS country,
|
|
302
|
+
s.latitude AS lat, s.longitude AS lon,
|
|
303
|
+
cr.relationship_type AS relationshipType, cr.locality_population AS population,
|
|
304
|
+
cr.distance_km AS distanceKm
|
|
305
|
+
FROM ${COINCIDENT_ROLES_TABLE} cr JOIN spr s ON s.id = cr.locality_id`)
|
|
306
|
+
.all();
|
|
307
|
+
for (const r of rows) {
|
|
308
|
+
const candidate = {
|
|
309
|
+
id: r.id,
|
|
310
|
+
name: r.name,
|
|
311
|
+
placetype: "locality",
|
|
312
|
+
country: r.country,
|
|
313
|
+
lat: r.lat,
|
|
314
|
+
lon: r.lon,
|
|
315
|
+
score: 0,
|
|
316
|
+
relationshipType: r.relationshipType,
|
|
317
|
+
population: r.population,
|
|
318
|
+
distanceKm: r.distanceKm,
|
|
319
|
+
};
|
|
320
|
+
const list = map.get(r.adminId);
|
|
321
|
+
if (list)
|
|
322
|
+
list.push(candidate);
|
|
323
|
+
else
|
|
324
|
+
map.set(r.adminId, [candidate]);
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
this.#coincidentRolesCache = map;
|
|
328
|
+
}
|
|
329
|
+
return this.#coincidentRolesCache.get(id) ?? [];
|
|
330
|
+
}
|
|
331
|
+
/**
|
|
332
|
+
* The ancestor lineage of a place — its containment chain joined with `spr` for canonical names,
|
|
333
|
+
* ordered NEAREST-FIRST (localadmin → county → region → … → country). Backs
|
|
334
|
+
* {@link ResolveOpts.includeAncestors} (#404). Self is excluded; memoized per id. Returns `[]`
|
|
335
|
+
* when the place has no recorded ancestry.
|
|
336
|
+
*
|
|
337
|
+
* The walk itself lives in `ancestry.ts` (shared with the reverse geocoder, #484); the ordering
|
|
338
|
+
* is its `PLACETYPE_DEPTH` table — same ranking as the previous inline SQL CASE, extended below
|
|
339
|
+
* `localadmin` so locality/neighbourhood ancestors order correctly instead of sorting last.
|
|
340
|
+
*/
|
|
341
|
+
ancestors(id) {
|
|
342
|
+
const pid = typeof id === "number" ? id : Number(id);
|
|
343
|
+
if (!Number.isFinite(pid))
|
|
344
|
+
return [];
|
|
345
|
+
const cached = this.#ancestorsCache.get(pid);
|
|
346
|
+
if (cached)
|
|
347
|
+
return cached;
|
|
348
|
+
const lineage = ancestorLineage(this.#db, pid).map((r) => ({
|
|
349
|
+
id: r.id,
|
|
350
|
+
placetype: r.placetype,
|
|
351
|
+
name: r.name,
|
|
352
|
+
}));
|
|
353
|
+
this.#ancestorsCache.set(pid, lineage);
|
|
354
|
+
return lineage;
|
|
355
|
+
}
|
|
356
|
+
/**
|
|
357
|
+
* Surface an unknown strategy name LOUDLY (once per name) rather than swallowing it silently — an
|
|
358
|
+
* invisible no-op is exactly the hidden-dependency failure mode we avoid (see the
|
|
359
|
+
* provenance-first design value). We warn rather than throw so a convention asset built against a
|
|
360
|
+
* newer code revision (one that adds a strategy) degrades gracefully on an older build instead of
|
|
361
|
+
* taking down resolution.
|
|
362
|
+
*/
|
|
363
|
+
#warnUnknownStrategy(name) {
|
|
364
|
+
if (this.#warnedUnknownStrategies.has(name))
|
|
365
|
+
return;
|
|
366
|
+
this.#warnedUnknownStrategies.add(name);
|
|
367
|
+
console.warn(`WofSqlitePlaceLookup: a convention names strategy "${name}", which this build does not register ` +
|
|
368
|
+
`(known: ${[...this.#strategies.keys()].join(", ")}). Skipping it. If the convention asset was built ` +
|
|
369
|
+
`against a newer code revision, rebuild the asset for this one.`);
|
|
370
|
+
}
|
|
371
|
+
/**
|
|
372
|
+
* Strategy `postcode_area_resolution` — the coordinate-first locality path, strictly gated (a
|
|
373
|
+
* sibling postcode AND a postcode_locality table AND a locality query). Returns `null` — so the
|
|
374
|
+
* dispatcher falls through to the next strategy — when the gate is unmet or the postcode isn't in
|
|
375
|
+
* the table; otherwise the soft-scored postcode∪name candidate set.
|
|
376
|
+
*/
|
|
377
|
+
#postcodeAreaResolution(query, convention) {
|
|
378
|
+
if (!(query.postcode && this.#postcodeLocalityShard && this.#isLocalityQuery(query))) {
|
|
379
|
+
return Promise.resolve(null);
|
|
380
|
+
}
|
|
381
|
+
return this.#findLocalityCoordFirst(query, this.#postcodeLocalityShard, convention);
|
|
382
|
+
}
|
|
383
|
+
/**
|
|
384
|
+
* Strategy `fallback_fuzzy_name_match` — the BM25 FTS name-match over the gazetteer, the
|
|
385
|
+
* universal fallback. Always returns an array (never null), so it terminates the dispatch chain.
|
|
386
|
+
*/
|
|
387
|
+
async #fuzzyNameMatch(query) {
|
|
388
|
+
const limit = query.limit ?? 10;
|
|
389
|
+
// Over-fetch so post-scoring + exact-match tiering have room to re-rank. SHORT queries (a 2–3-char
|
|
390
|
+
// region abbreviation like "NY"/"VT") are the danger case the `exactMatchTiering` docstring flags:
|
|
391
|
+
// the exact-abbrev holder's BM25 is poor (its long multilingual alt-name document tanks the score),
|
|
392
|
+
// so under the normal `limit * 4` window it drops OUT of the candidate pool BEFORE tiering can
|
|
393
|
+
// promote it — "NY" then resolves to a token-matching foreign region (Highland, GB) instead of New
|
|
394
|
+
// York. Widen the window for short queries so the exact match is always present to be tiered.
|
|
395
|
+
// (Cross-country abbrev collisions — "VT" is BOTH Vermont and Viterbo — still need a country/
|
|
396
|
+
// postcode signal to disambiguate; this only rescues the window-drop class, not genuine ambiguity.
|
|
397
|
+
// With a `country` hint every abbrev resolves; bare + no-context lifts 7→10/15 US states.)
|
|
398
|
+
const ftsLimit = query.text.trim().length <= 3 ? Math.max(limit * 4, SHORT_QUERY_OVERFETCH) : limit * 4;
|
|
399
|
+
// Expand the placetype filter through the shared equivalence table (core/resolver): a
|
|
400
|
+
// `locality` query must also reach `borough` / `localadmin` rows — Brooklyn-the-borough
|
|
401
|
+
// (pop 2.5M) is a borough, not a locality, and a strict filter made it unreachable so the
|
|
402
|
+
// fuzzy "Brooklyn Park, MN" won instead. Order-preserving: the FIRST entry stays the
|
|
403
|
+
// requested placetype, which is what shard routing keys off below.
|
|
404
|
+
const placetypes = expandPlacetypeFilter(normalizePlacetypes(query.placetype));
|
|
405
|
+
const ftsQuery = sanitizeFtsQuery(query.text);
|
|
406
|
+
if (!ftsQuery)
|
|
407
|
+
return [];
|
|
408
|
+
// Pick the shard for this query. Multi-shard routing is placetype-driven; a query without
|
|
409
|
+
// `placetype` always goes to main. (Mixed-placetype queries with multiple shards aren't
|
|
410
|
+
// supported in v1 — caller can issue two findPlace calls and merge in TS if needed.)
|
|
411
|
+
const firstPlacetype = placetypes?.[0];
|
|
412
|
+
const shard = pickShardForPlacetype(this.#shards, firstPlacetype);
|
|
413
|
+
const sch = shard.schemaName; // bare schema name; safe to interpolate (validated at construction)
|
|
414
|
+
// Filter out historical / superseded / deprecated places by default — they live in the same
|
|
415
|
+
// spr table but should never win a contemporary lookup. `is_current = 0` is the only WOF
|
|
416
|
+
// value that means "not current"; both `-1` (modern) and `1` (legacy) mean current. See #91.
|
|
417
|
+
// Note: with schema-qualified FROM the bare `place_search` reference in MATCH resolves to
|
|
418
|
+
// the FROM table — required by FTS5 parser, see sharding.ts header comment.
|
|
419
|
+
const where = ["place_search MATCH ?", "spr.is_current != 0", "spr.is_deprecated = 0"];
|
|
420
|
+
const params = [ftsQuery];
|
|
421
|
+
if (placetypes && placetypes.length > 0) {
|
|
422
|
+
where.push(`spr.placetype IN (${placetypes.map(() => "?").join(", ")})`);
|
|
423
|
+
params.push(...placetypes);
|
|
424
|
+
}
|
|
425
|
+
if (query.country) {
|
|
426
|
+
where.push("spr.country = ?");
|
|
427
|
+
params.push(query.country);
|
|
428
|
+
}
|
|
429
|
+
if (query.parentId !== undefined) {
|
|
430
|
+
where.push(`(spr.parent_id = ? OR spr.id IN (SELECT id FROM ${sch}.ancestors WHERE ancestor_id = ?))`);
|
|
431
|
+
params.push(query.parentId, query.parentId);
|
|
432
|
+
}
|
|
433
|
+
// Bbox + near-with-radius are SQL-level filters via the R*Tree. We only emit the JOIN when
|
|
434
|
+
// the active shard has the R*Tree; missing-but-requested is silently treated as no-bbox-
|
|
435
|
+
// filter so legacy DBs / shards-without-bbox don't crash.
|
|
436
|
+
const shardHasBbox = this.#hasBboxIndex.get(sch) === true;
|
|
437
|
+
const useBboxJoin = (query.bbox || query.near?.maxDistanceKm !== undefined) && shardHasBbox;
|
|
438
|
+
let joinClause = `JOIN ${sch}.spr ON spr.id = place_search.wof_id`;
|
|
439
|
+
if (useBboxJoin) {
|
|
440
|
+
joinClause += ` JOIN ${sch}.${PLACE_BBOX_TABLE} bbox ON bbox.id = spr.id`;
|
|
441
|
+
// AABB intersection — both bbox sides must overlap. R*Tree handles this in O(log n).
|
|
442
|
+
const filterBox = query.bbox
|
|
443
|
+
? query.bbox
|
|
444
|
+
: bboxAround(query.near.lat, query.near.lon, query.near.maxDistanceKm);
|
|
445
|
+
where.push("bbox.min_lat <= ? AND bbox.max_lat >= ?", "bbox.min_lon <= ? AND bbox.max_lon >= ?");
|
|
446
|
+
params.push(filterBox.maxLat, filterBox.minLat, filterBox.maxLon, filterBox.minLon);
|
|
447
|
+
}
|
|
448
|
+
// LEFT JOIN the population aux table when present. Missing-on-this-shard means the SELECT
|
|
449
|
+
// just doesn't include the population column; the post-scoring loop treats it as 0.
|
|
450
|
+
const shardHasPopulation = this.#hasPopulationIndex.get(sch) === true;
|
|
451
|
+
const populationSelect = shardHasPopulation
|
|
452
|
+
? `${PLACE_POPULATION_TABLE}.population AS population`
|
|
453
|
+
: `NULL AS population`;
|
|
454
|
+
const populationJoin = shardHasPopulation
|
|
455
|
+
? `LEFT JOIN ${sch}.${PLACE_POPULATION_TABLE} ON ${PLACE_POPULATION_TABLE}.id = spr.id`
|
|
456
|
+
: "";
|
|
457
|
+
// Push the population boost into the ORDER BY when the index is available, so famous places
|
|
458
|
+
// (whose long alt-name lists hurt BM25) actually make it into the over-fetch window. The TS
|
|
459
|
+
// post-scoring will still compute the same boost for the final score; this just ensures the
|
|
460
|
+
// candidate set is right.
|
|
461
|
+
//
|
|
462
|
+
// Formula: rank_adjusted = bm25 - populationBoost * min(1.0, log10(1 + pop) / scaleLog10)
|
|
463
|
+
// Lower rank_adjusted = better (matches SQLite's bm25 convention of "more negative = better").
|
|
464
|
+
const orderByExpr = shardHasPopulation
|
|
465
|
+
? `(bm25(place_search) - ? * MIN(1.0, COALESCE(log10(1.0 + ${PLACE_POPULATION_TABLE}.population), 0) / ?))`
|
|
466
|
+
: "bm25(place_search)";
|
|
467
|
+
// Schema-qualified FROM with bare-name MATCH — required syntax for FTS5 on attached schemas.
|
|
468
|
+
// See sharding.ts header for the gotcha that drove this design.
|
|
469
|
+
const stmt = this.#db.prepare(`
|
|
470
|
+
SELECT
|
|
471
|
+
spr.id AS id,
|
|
472
|
+
spr.name,
|
|
473
|
+
spr.placetype,
|
|
474
|
+
spr.country,
|
|
475
|
+
spr.parent_id,
|
|
476
|
+
bm25(place_search) AS rank,
|
|
477
|
+
spr.latitude AS lat,
|
|
478
|
+
spr.longitude AS lon,
|
|
479
|
+
spr.min_latitude, spr.max_latitude, spr.min_longitude, spr.max_longitude,
|
|
480
|
+
${populationSelect}
|
|
481
|
+
FROM ${sch}.place_search
|
|
482
|
+
${joinClause}
|
|
483
|
+
${populationJoin}
|
|
484
|
+
WHERE ${where.join(" AND ")}
|
|
485
|
+
ORDER BY ${orderByExpr} ASC
|
|
486
|
+
LIMIT ?
|
|
487
|
+
`);
|
|
488
|
+
if (shardHasPopulation) {
|
|
489
|
+
params.push(this.#weights.populationBoost, this.#weights.populationScaleLog10);
|
|
490
|
+
}
|
|
491
|
+
params.push(ftsLimit);
|
|
492
|
+
const rawRows = stmt.all(...params);
|
|
493
|
+
const queryLen = query.text.length;
|
|
494
|
+
const candidates = rawRows.map((row) => {
|
|
495
|
+
// SQLite's bm25() returns a lower-is-better score (negative for matches). Negate so we
|
|
496
|
+
// start from a higher-is-better baseline.
|
|
497
|
+
let score = -row.rank;
|
|
498
|
+
if (placetypes && placetypes.length > 0 && placetypes.includes(row.placetype)) {
|
|
499
|
+
score += this.#weights.placetypeMatchBoost;
|
|
500
|
+
}
|
|
501
|
+
if (!placetypes && row.placetype === "locality") {
|
|
502
|
+
score += this.#weights.localityImplicitBoost;
|
|
503
|
+
}
|
|
504
|
+
if (query.country && row.country === query.country) {
|
|
505
|
+
score += this.#weights.countryMatchBoost;
|
|
506
|
+
}
|
|
507
|
+
if (query.parentId !== undefined) {
|
|
508
|
+
if (row.parent_id === query.parentId) {
|
|
509
|
+
score += this.#weights.directChildBoost;
|
|
510
|
+
}
|
|
511
|
+
else {
|
|
512
|
+
score += this.#weights.descendantBoost;
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
const extraLen = Math.max(0, row.name.length - queryLen - 3);
|
|
516
|
+
score -= (this.#weights.lengthPenaltyWeight * extraLen) / 10;
|
|
517
|
+
// Proximity boost: only applied when the query carries `near` AND the candidate has real
|
|
518
|
+
// coordinates. The formula decays smoothly with distance so close-but-not-exact hits
|
|
519
|
+
// still benefit; tunable via proximityBoost + proximityScaleKm.
|
|
520
|
+
let distanceKm;
|
|
521
|
+
if (query.near && row.lat !== null && row.lon !== null && !(row.lat === 0 && row.lon === 0)) {
|
|
522
|
+
distanceKm = haversineKm(query.near.lat, query.near.lon, row.lat, row.lon);
|
|
523
|
+
score += this.#weights.proximityBoost / (1 + distanceKm / this.#weights.proximityScaleKm);
|
|
524
|
+
}
|
|
525
|
+
// Population boost: capped at `populationBoost` magnitude at `10^populationScaleLog10`
|
|
526
|
+
// people. Missing population → no contribution. Never penalizes.
|
|
527
|
+
if (row.population !== null && row.population > 0 && this.#weights.populationScaleLog10 > 0) {
|
|
528
|
+
const popLog = Math.log10(1 + row.population);
|
|
529
|
+
const popFraction = Math.min(1, popLog / this.#weights.populationScaleLog10);
|
|
530
|
+
score += this.#weights.populationBoost * popFraction;
|
|
531
|
+
}
|
|
532
|
+
const candidate = {
|
|
533
|
+
id: row.id,
|
|
534
|
+
name: row.name,
|
|
535
|
+
placetype: row.placetype,
|
|
536
|
+
country: row.country ?? "",
|
|
537
|
+
lat: row.lat ?? 0,
|
|
538
|
+
lon: row.lon ?? 0,
|
|
539
|
+
parent_id: row.parent_id ?? undefined,
|
|
540
|
+
score,
|
|
541
|
+
};
|
|
542
|
+
if (distanceKm !== undefined)
|
|
543
|
+
candidate.distanceKm = distanceKm;
|
|
544
|
+
if (row.population !== null && row.population > 0)
|
|
545
|
+
candidate.population = row.population;
|
|
546
|
+
// Candidate bbox — parity with the WASM lookup (resolver-wof-wasm/lookup.ts), whose
|
|
547
|
+
// consumers (the demo cascade's region constraint) read it. Without this the Node
|
|
548
|
+
// backend's region→bbox constraint is dead and disambiguation falls to population
|
|
549
|
+
// ranking (the Springfield-IL→MO failure the #524 smoke eval caught).
|
|
550
|
+
if (row.min_latitude != null &&
|
|
551
|
+
row.max_latitude != null &&
|
|
552
|
+
row.min_longitude != null &&
|
|
553
|
+
row.max_longitude != null) {
|
|
554
|
+
candidate.bbox = {
|
|
555
|
+
minLat: row.min_latitude,
|
|
556
|
+
maxLat: row.max_latitude,
|
|
557
|
+
minLon: row.min_longitude,
|
|
558
|
+
maxLon: row.max_longitude,
|
|
559
|
+
};
|
|
560
|
+
}
|
|
561
|
+
return candidate;
|
|
562
|
+
});
|
|
563
|
+
// Exact-match tiering: a candidate whose name OR any alias equals the query text (case-folded)
|
|
564
|
+
// ranks above any partial match, with the weighted-sum score (incl. population) breaking ties
|
|
565
|
+
// WITHIN a tier. See the RankingWeights.exactMatchTiering docstring for why this aligns the
|
|
566
|
+
// population prior rather than overriding it. One cheap indexed lookup over the candidate ids.
|
|
567
|
+
// Runs even for a SINGLE candidate so `exactMatch` is stamped consistently (parity with the
|
|
568
|
+
// WASM lookup) — a sole alias hit ("New York City" → New York) must still carry the flag the
|
|
569
|
+
// demo cascade / #369 re-rank read.
|
|
570
|
+
if (this.#weights.exactMatchTiering && candidates.length > 0) {
|
|
571
|
+
const exactIds = this.#exactMatchIds(sch, candidates.map((c) => c.id), query.text);
|
|
572
|
+
// Stamp the tier onto every candidate (not just when the tiering sort fires) so a downstream
|
|
573
|
+
// re-rank — #369's postcode-anchor country pin in `resolveTree` — can keep the country pin from
|
|
574
|
+
// crossing the exact/partial boundary ("ME" → Maine, not the more-populous Missouri).
|
|
575
|
+
for (const c of candidates)
|
|
576
|
+
c.exactMatch = exactIds.has(c.id);
|
|
577
|
+
if (exactIds.size > 0 && exactIds.size < candidates.length) {
|
|
578
|
+
candidates.sort((a, b) => {
|
|
579
|
+
const ax = exactIds.has(a.id) ? 1 : 0;
|
|
580
|
+
const bx = exactIds.has(b.id) ? 1 : 0;
|
|
581
|
+
return bx - ax || b.score - a.score;
|
|
582
|
+
});
|
|
583
|
+
return Promise.resolve(candidates.slice(0, limit));
|
|
584
|
+
}
|
|
585
|
+
}
|
|
586
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
587
|
+
return Promise.resolve(candidates.slice(0, limit));
|
|
588
|
+
}
|
|
589
|
+
#isLocalityQuery(query) {
|
|
590
|
+
const pts = normalizePlacetypes(query.placetype);
|
|
591
|
+
return !pts || pts.includes("locality");
|
|
592
|
+
}
|
|
593
|
+
/**
|
|
594
|
+
* Resolve the effective convention for a query (the Geographic Rule Engine entry point). The
|
|
595
|
+
* ancestor chain is keyed by WOF polygon id; for #289 it carries just the country level —
|
|
596
|
+
* resolved from `query.country` via the cached code→WOF-id lookup — so the EU locales, which have
|
|
597
|
+
* no override rows, resolve to `WORLD_DEFAULT` and dispatch is byte-identical to the pre-engine
|
|
598
|
+
* path. E4 (JP) extends the chain with the resolved locality's `ancestors` row, so a
|
|
599
|
+
* region/locality-level convention (e.g. Sapporo's grid) deep-merges over the country one.
|
|
600
|
+
*/
|
|
601
|
+
#conventionFor(query) {
|
|
602
|
+
const chain = [];
|
|
603
|
+
if (query.country) {
|
|
604
|
+
const cid = this.#countryWofId(query.country);
|
|
605
|
+
if (cid !== null)
|
|
606
|
+
chain.push(cid);
|
|
607
|
+
}
|
|
608
|
+
return resolveConvention(this.#conventionSource, chain);
|
|
609
|
+
}
|
|
610
|
+
/**
|
|
611
|
+
* Country ISO code → its WOF polygon id (the coarsest convention key). Cached — one indexed `spr`
|
|
612
|
+
* query per distinct country, then memoized (including a not-found `null`) so findPlace never
|
|
613
|
+
* pays for it twice.
|
|
614
|
+
*/
|
|
615
|
+
#countryWofId(code) {
|
|
616
|
+
const cached = this.#countryWofIdCache.get(code);
|
|
617
|
+
if (cached !== undefined)
|
|
618
|
+
return cached;
|
|
619
|
+
let id = null;
|
|
620
|
+
try {
|
|
621
|
+
const row = this.#db
|
|
622
|
+
.prepare(`SELECT id FROM main.spr WHERE placetype = 'country' AND country = ? AND is_current != 0 LIMIT 1`)
|
|
623
|
+
.get(code);
|
|
624
|
+
id = row?.id ?? null;
|
|
625
|
+
}
|
|
626
|
+
catch {
|
|
627
|
+
id = null;
|
|
628
|
+
}
|
|
629
|
+
this.#countryWofIdCache.set(code, id);
|
|
630
|
+
return id;
|
|
631
|
+
}
|
|
632
|
+
/**
|
|
633
|
+
* Coordinate-first locality resolution. The postcode_locality table maps the sibling postcode to
|
|
634
|
+
* the locality whose polygon contains the postcode centroid (+ a few nearby ones for the
|
|
635
|
+
* abutting- postcode case). We union those COORDINATE candidates with the FTS NAME candidates and
|
|
636
|
+
* soft-score the union `0.6·S_pc + 0.3·S_name + 0.1·S_pop` — so a small town the name-match never
|
|
637
|
+
* finds is recovered by the postcode, while an unambiguous name (Berlin) still wins on name +
|
|
638
|
+
* population. Returns null when the postcode isn't in the table (→ caller falls back to the FTS
|
|
639
|
+
* path).
|
|
640
|
+
*/
|
|
641
|
+
async #findLocalityCoordFirst(query, sch, convention) {
|
|
642
|
+
const w = convention.scoringWeights;
|
|
643
|
+
const pc = query.postcode.trim();
|
|
644
|
+
const pcWhere = query.country ? "postcode = ? AND country = ?" : "postcode = ?";
|
|
645
|
+
const pcParams = query.country ? [pc, query.country] : [pc];
|
|
646
|
+
const pcRows = this.#db
|
|
647
|
+
.prepare(`SELECT locality_id AS id, aliases, distance_km AS dist, is_containing AS containing
|
|
648
|
+
FROM ${sch}.${POSTCODE_LOCALITY_TABLE} WHERE ${pcWhere}`)
|
|
649
|
+
.all(...pcParams);
|
|
650
|
+
if (pcRows.length === 0)
|
|
651
|
+
return null;
|
|
652
|
+
const limit = query.limit ?? 10;
|
|
653
|
+
// Name-match candidates via the normal FTS path (postcode cleared → no recursion).
|
|
654
|
+
const ftsCands = await this.findPlace({ ...query, postcode: undefined, limit: Math.max(limit, 10) });
|
|
655
|
+
const pcInfo = new Map();
|
|
656
|
+
for (const r of pcRows) {
|
|
657
|
+
pcInfo.set(r.id, { dist: r.dist, containing: r.containing === 1, aliases: r.aliases ? r.aliases.split("|") : [] });
|
|
658
|
+
}
|
|
659
|
+
// #475 (opt-in): observed postal-city aliases for this postcode, keyed by the geographic
|
|
660
|
+
// locality name they map to. A user-typed postal city ("Antioch", 37013) becomes a name-match
|
|
661
|
+
// alias for the geographic locality the postcode sits in ("Nashville"). Empty when the reader
|
|
662
|
+
// isn't supplied → the scoring loop below is byte-identical to pre-#475.
|
|
663
|
+
const postalAliasByGeo = new Map();
|
|
664
|
+
if (this.#postalCityAliases) {
|
|
665
|
+
for (const a of await this.#postalCityAliases.getDivergentAliases(pc)) {
|
|
666
|
+
const key = cfNormalize(a.geoLocality);
|
|
667
|
+
if (!key)
|
|
668
|
+
continue;
|
|
669
|
+
const bag = postalAliasByGeo.get(key);
|
|
670
|
+
if (bag)
|
|
671
|
+
bag.push(a.postalCity);
|
|
672
|
+
else
|
|
673
|
+
postalAliasByGeo.set(key, [a.postalCity]);
|
|
674
|
+
}
|
|
675
|
+
}
|
|
676
|
+
const merged = new Map();
|
|
677
|
+
for (const c of ftsCands)
|
|
678
|
+
merged.set(c.id, c);
|
|
679
|
+
const missing = [...pcInfo.keys()].filter((id) => !merged.has(id));
|
|
680
|
+
for (const row of this.#fetchLocalitiesById(missing))
|
|
681
|
+
merged.set(row.id, row);
|
|
682
|
+
const scored = [];
|
|
683
|
+
for (const cand of merged.values()) {
|
|
684
|
+
const info = pcInfo.get(cand.id);
|
|
685
|
+
const sPc = info ? (info.containing ? 1 : Math.exp(-info.dist / CF_PC_DECAY_KM)) : 0;
|
|
686
|
+
// Fold any postal-city aliases for this candidate's geographic name into the soft name match
|
|
687
|
+
// (#475). `postalAliasByGeo` is empty unless the opt-in reader was supplied, so when off this
|
|
688
|
+
// reduces to the original `info?.aliases ?? []` and the score is unchanged.
|
|
689
|
+
const wofAliases = info?.aliases ?? [];
|
|
690
|
+
const aliases = postalAliasByGeo.size > 0
|
|
691
|
+
? [...wofAliases, ...(postalAliasByGeo.get(cfNormalize(cand.name)) ?? [])]
|
|
692
|
+
: wofAliases;
|
|
693
|
+
const sName = softNameScore(query.text, cand.name, aliases);
|
|
694
|
+
const sPop = cand.population && cand.population > 0 ? Math.min(1, Math.log10(1 + cand.population) / 6) : 0;
|
|
695
|
+
scored.push({ ...cand, score: w.pc * sPc + w.name * sName + w.pop * sPop, exact: sName >= 1 });
|
|
696
|
+
}
|
|
697
|
+
// Exact-name tiering (same philosophy as the FTS path): an EXACT name/alias match tiers above
|
|
698
|
+
// coordinate-only candidates, with the soft-score breaking ties WITHIN a tier. This keeps an
|
|
699
|
+
// unambiguous city ("Berlin", exact + huge population) ahead of the fine-grained Ortsteil its
|
|
700
|
+
// postcode centroid lands in, while a small town the name-match never finds (no exact tier) is
|
|
701
|
+
// still recovered by its postcode's containing locality.
|
|
702
|
+
scored.sort((a, b) => Number(b.exact) - Number(a.exact) || b.score - a.score);
|
|
703
|
+
// Conflict flag: if the chosen locality is NOT the postcode's containing locality and sits far
|
|
704
|
+
// from it, the postcode and the city name disagree (a transposed / wrong-for-the-city postcode).
|
|
705
|
+
// We keep the name-chosen locality but flag it — the falsehood signal a BM25 geocoder can't give.
|
|
706
|
+
const top = scored[0];
|
|
707
|
+
if (top) {
|
|
708
|
+
// The postcode's geographic anchor: among the postcode's candidate localities that actually
|
|
709
|
+
// resolved (some — e.g. unnamed Ortsteile — are in the postcode table but not the admin DB),
|
|
710
|
+
// prefer the containing one, else the nearest. Postcodes whose centroid falls just outside
|
|
711
|
+
// every locality polygon still anchor to the closest town.
|
|
712
|
+
const anchorRow = pcRows
|
|
713
|
+
.filter((r) => merged.has(r.id))
|
|
714
|
+
.sort((a, b) => b.containing - a.containing || a.dist - b.dist)[0];
|
|
715
|
+
const anchor = anchorRow ? merged.get(anchorRow.id) : undefined;
|
|
716
|
+
if (anchor && top.id !== anchorRow.id) {
|
|
717
|
+
if (haversineKm(top.lat, top.lon, anchor.lat, anchor.lon) > CF_MISMATCH_KM)
|
|
718
|
+
top.mismatch = true;
|
|
719
|
+
}
|
|
720
|
+
}
|
|
721
|
+
return scored.slice(0, limit).map(({ exact, ...c }) => {
|
|
722
|
+
void exact;
|
|
723
|
+
return c;
|
|
724
|
+
});
|
|
725
|
+
}
|
|
726
|
+
/** Fetch locality spr rows (from main) for the postcode-injected candidate ids the FTS set missed. */
|
|
727
|
+
#fetchLocalitiesById(ids) {
|
|
728
|
+
if (ids.length === 0)
|
|
729
|
+
return [];
|
|
730
|
+
const hasPop = this.#hasPopulationIndex.get("main") === true;
|
|
731
|
+
const popSelect = hasPop ? `pp.population AS population` : `NULL AS population`;
|
|
732
|
+
const popJoin = hasPop ? `LEFT JOIN main.${PLACE_POPULATION_TABLE} pp ON pp.id = s.id` : "";
|
|
733
|
+
const ph = ids.map(() => "?").join(", ");
|
|
734
|
+
const rows = this.#db
|
|
735
|
+
.prepare(`SELECT s.id AS id, s.name AS name, s.country AS country, s.parent_id AS parent_id,
|
|
736
|
+
s.latitude AS lat, s.longitude AS lon, s.placetype AS placetype, ${popSelect}
|
|
737
|
+
FROM main.spr s ${popJoin}
|
|
738
|
+
WHERE s.id IN (${ph}) AND s.is_current != 0`)
|
|
739
|
+
.all(...ids);
|
|
740
|
+
return rows.map((row) => {
|
|
741
|
+
const c = {
|
|
742
|
+
id: row.id,
|
|
743
|
+
name: row.name,
|
|
744
|
+
placetype: row.placetype,
|
|
745
|
+
country: row.country ?? "",
|
|
746
|
+
lat: row.lat ?? 0,
|
|
747
|
+
lon: row.lon ?? 0,
|
|
748
|
+
parent_id: row.parent_id ?? undefined,
|
|
749
|
+
score: 0,
|
|
750
|
+
};
|
|
751
|
+
if (row.population !== null && row.population > 0)
|
|
752
|
+
c.population = row.population;
|
|
753
|
+
return c;
|
|
754
|
+
});
|
|
755
|
+
}
|
|
756
|
+
/**
|
|
757
|
+
* Among `ids`, return the subset whose name OR any alias equals `text` case-insensitively — the
|
|
758
|
+
* exact-match tier for ranking. One indexed query over `<schema>.names`. When the shard has no
|
|
759
|
+
* `names` table (a slim DB built with `dropNames`, or a postcode-only shard), fall back to the
|
|
760
|
+
* self-contained `place_search` FTS content: its `alt_names` column is the same alias set joined
|
|
761
|
+
* on the boundary-preserving `ALIAS_SEPARATOR` (#523), so `aliasBagExactMatch` recovers the exact
|
|
762
|
+
* alias tier ("New York City" → New York) that the dropped `names` table used to provide.
|
|
763
|
+
*/
|
|
764
|
+
#exactMatchIds(schemaName, ids, text) {
|
|
765
|
+
const out = new Set();
|
|
766
|
+
const trimmed = text.trim();
|
|
767
|
+
if (ids.length === 0 || !trimmed)
|
|
768
|
+
return out;
|
|
769
|
+
const placeholders = ids.map(() => "?").join(", ");
|
|
770
|
+
try {
|
|
771
|
+
const rows = this.#db
|
|
772
|
+
.prepare(`SELECT DISTINCT id FROM ${schemaName}.names WHERE id IN (${placeholders}) AND name = ? COLLATE NOCASE`)
|
|
773
|
+
.all(...ids, trimmed);
|
|
774
|
+
for (const r of rows)
|
|
775
|
+
out.add(r.id);
|
|
776
|
+
return out;
|
|
777
|
+
}
|
|
778
|
+
catch {
|
|
779
|
+
// No `names` table on this shard — fall through to the place_search alias bag.
|
|
780
|
+
}
|
|
781
|
+
try {
|
|
782
|
+
const rows = this.#db
|
|
783
|
+
.prepare(`SELECT wof_id AS id, name, alt_names FROM ${schemaName}.place_search WHERE wof_id IN (${placeholders})`)
|
|
784
|
+
.all(...ids);
|
|
785
|
+
const norm = (s) => s.toLowerCase().trim().replace(/\s+/g, " ");
|
|
786
|
+
const needle = norm(trimmed);
|
|
787
|
+
for (const r of rows) {
|
|
788
|
+
if (r.name !== null && norm(r.name) === needle)
|
|
789
|
+
out.add(r.id);
|
|
790
|
+
}
|
|
791
|
+
// Alias pass via the shared bag parser (#523). Separated bags (built since #523) get a true
|
|
792
|
+
// per-alias equality check, ungated — matching the `names`-table branch above, where an
|
|
793
|
+
// alias match counts as exact regardless of other candidates. Legacy bags (no separator)
|
|
794
|
+
// fall back to padded containment, gated on "no canonical exact in the pool" because their
|
|
795
|
+
// lost boundaries would otherwise false-promote interior fragments ("York" inside the alias
|
|
796
|
+
// "New York City") or cross-alias fragments ("York New" across "…York" + "New City…").
|
|
797
|
+
const anyCanonicalExact = out.size > 0;
|
|
798
|
+
for (const r of rows) {
|
|
799
|
+
if (aliasBagExactMatch(r.alt_names, needle, anyCanonicalExact))
|
|
800
|
+
out.add(r.id);
|
|
801
|
+
}
|
|
802
|
+
}
|
|
803
|
+
catch {
|
|
804
|
+
// Shard without place_search either → no exact-match tier. Falls back to weighted-sum order.
|
|
805
|
+
}
|
|
806
|
+
return out;
|
|
807
|
+
}
|
|
808
|
+
close() {
|
|
809
|
+
// Destroying the Kysely instance closes the underlying connection IF we own it. If the caller
|
|
810
|
+
// passed in a pre-opened DatabaseSync (test fixture), respect their ownership.
|
|
811
|
+
void this.#kysely.destroy();
|
|
812
|
+
if (this.#ownsDb) {
|
|
813
|
+
this.#db.close();
|
|
814
|
+
}
|
|
815
|
+
}
|
|
816
|
+
[Symbol.dispose]() {
|
|
817
|
+
this.close();
|
|
818
|
+
}
|
|
819
|
+
/** Build the FTS5 virtual table from the `names` + `places` tables. */
|
|
820
|
+
#ensureFts() {
|
|
821
|
+
buildPlaceSearchFts(this.#db);
|
|
822
|
+
}
|
|
823
|
+
#assertFtsExists() {
|
|
824
|
+
if (!placeSearchFtsExists(this.#db)) {
|
|
825
|
+
throw new Error("WofSqlitePlaceLookup: `place_search` FTS5 table is missing. Pass `buildFts: true` to build it on open, or run `mailwoman-wof-build-fts <path-to-wof.db>` ahead of time (see resolver-wof-sqlite/README.md).");
|
|
826
|
+
}
|
|
827
|
+
}
|
|
828
|
+
}
|
|
829
|
+
/**
 * Normalize a query's `placetype` option to either a non-empty array or `null`.
 *
 * - A falsy value (`undefined`, `null`, `""`) means "no placetype filter" → `null`.
 * - An empty array carries no constraint either, so it is also normalized to `null`. Returning
 *   `[]` would make callers that test `!placetypes` treat it differently from "no filter" even
 *   though it filters nothing.
 * - A non-empty array is returned as-is (same reference); a single value is wrapped in an array.
 *
 * @param p - a placetype, an array of placetypes, or a nullish/empty value.
 * @returns a non-empty array of placetypes, or `null` when there is no constraint.
 */
function normalizePlacetypes(p) {
    if (!p)
        return null;
    if (Array.isArray(p))
        return p.length > 0 ? p : null;
    return [p];
}
|
|
834
|
+
/**
 * Make an arbitrary user-typed string safe for FTS5 MATCH.
 *
 * FTS5 has its own query syntax (`"phrase"`, `term1 OR term2`, `prefix*`, NEAR/N, etc.). Letting
 * raw user input through means a user typing `Paris's` or `St. (Petersburg)` causes a syntax
 * error.
 *
 * Per-token rules (tokens are whitespace-separated after NFKC normalization):
 *
 * - Everything except letters and numbers is stripped from the token body.
 * - A **trailing `*`** is preserved as FTS5 **prefix syntax**: `627*` stays the bareword `627*`.
 *   The one exception is a body that is exactly `AND`, `OR` or `NOT`: as a bareword FTS5 reads
 *   it as an operator and rejects the query, so it is emitted as a quoted string followed by the
 *   star (`"OR"*`), which FTS5 also accepts as a prefix term.
 * - All other tokens are wrapped in `"..."` as a single-word phrase.
 * - Tokens with no letters or numbers are dropped.
 * - Multiple tokens join with a space (implicit AND).
 *
 * Examples:
 *
 * - `Paris` → `"Paris"` (phrase)
 * - `627*` → `627*` (prefix)
 * - `St. (Petersburg)` → `"St" "Petersburg"` (two phrases, AND-joined)
 * - `Pari* TX` → `Pari* "TX"` (mixed prefix + phrase)
 * - `OR*` → `"OR"*` (prefix on a word that collides with an operator keyword)
 * - `*` alone → empty string (no body → dropped)
 *
 * @param {string} text - raw user input.
 * @returns {string} an FTS5 query string; empty when no token survives.
 */
function sanitizeFtsQuery(text) {
    const terms = [];
    for (const rawToken of text.normalize("NFKC").split(/\s+/u)) {
        const token = rawToken.trim();
        if (!token)
            continue;
        const wantsPrefix = token.endsWith("*");
        // Keep letters + numbers only. Apostrophes, hyphens, quotes and any embedded `*` all go,
        // so the body can never contain a `"` and needs no escaping when quoted below.
        const body = token.replace(/[^\p{L}\p{N}]/gu, "");
        if (!body)
            continue;
        if (!wantsPrefix) {
            terms.push(`"${body}"`);
            continue;
        }
        // FTS5 operator keywords are case-sensitive; only the upper-case forms need quoting.
        const isOperatorKeyword = body === "AND" || body === "OR" || body === "NOT";
        terms.push(isOperatorKeyword ? `"${body}"*` : `${body}*`);
    }
    return terms.join(" ");
}
|
|
874
|
+
// `sql` is imported only because future Kysely-typed queries will use it; silence "unused" linting.
|
|
875
|
+
// Evaluate-and-discard reference to the imported `sql` binding: no runtime effect, it only keeps
// the import counted as "used" by linters.
void sql;
|
|
876
|
+
//# sourceMappingURL=lookup.js.map
|