@mailwoman/resolver-wof-sqlite 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +250 -0
- package/out/address-point-interpolation.d.ts +48 -0
- package/out/address-point-interpolation.d.ts.map +1 -0
- package/out/address-point-interpolation.js +164 -0
- package/out/address-point-interpolation.js.map +1 -0
- package/out/address-point-schema.d.ts +58 -0
- package/out/address-point-schema.d.ts.map +1 -0
- package/out/address-point-schema.js +67 -0
- package/out/address-point-schema.js.map +1 -0
- package/out/address-point.d.ts +29 -0
- package/out/address-point.d.ts.map +1 -0
- package/out/address-point.js +62 -0
- package/out/address-point.js.map +1 -0
- package/out/ancestry.d.ts +40 -0
- package/out/ancestry.d.ts.map +1 -0
- package/out/ancestry.js +53 -0
- package/out/ancestry.js.map +1 -0
- package/out/build-candidate-cli.d.ts +16 -0
- package/out/build-candidate-cli.d.ts.map +1 -0
- package/out/build-candidate-cli.js +80 -0
- package/out/build-candidate-cli.js.map +1 -0
- package/out/build-candidate.d.ts +54 -0
- package/out/build-candidate.d.ts.map +1 -0
- package/out/build-candidate.js +230 -0
- package/out/build-candidate.js.map +1 -0
- package/out/build-coincident-roles-cli.d.ts +16 -0
- package/out/build-coincident-roles-cli.d.ts.map +1 -0
- package/out/build-coincident-roles-cli.js +94 -0
- package/out/build-coincident-roles-cli.js.map +1 -0
- package/out/build-fts-cli.d.ts +23 -0
- package/out/build-fts-cli.d.ts.map +1 -0
- package/out/build-fts-cli.js +117 -0
- package/out/build-fts-cli.js.map +1 -0
- package/out/build-slim-cli.d.ts +14 -0
- package/out/build-slim-cli.d.ts.map +1 -0
- package/out/build-slim-cli.js +130 -0
- package/out/build-slim-cli.js.map +1 -0
- package/out/build-slim.d.ts +71 -0
- package/out/build-slim.d.ts.map +1 -0
- package/out/build-slim.js +267 -0
- package/out/build-slim.js.map +1 -0
- package/out/candidate-lookup.d.ts +43 -0
- package/out/candidate-lookup.d.ts.map +1 -0
- package/out/candidate-lookup.js +191 -0
- package/out/candidate-lookup.js.map +1 -0
- package/out/candidate-schema.d.ts +86 -0
- package/out/candidate-schema.d.ts.map +1 -0
- package/out/candidate-schema.js +109 -0
- package/out/candidate-schema.js.map +1 -0
- package/out/coincident-roles.d.ts +86 -0
- package/out/coincident-roles.d.ts.map +1 -0
- package/out/coincident-roles.js +160 -0
- package/out/coincident-roles.js.map +1 -0
- package/out/convention.d.ts +109 -0
- package/out/convention.d.ts.map +1 -0
- package/out/convention.js +94 -0
- package/out/convention.js.map +1 -0
- package/out/fst-autocomplete.d.ts +49 -0
- package/out/fst-autocomplete.d.ts.map +1 -0
- package/out/fst-autocomplete.js +124 -0
- package/out/fst-autocomplete.js.map +1 -0
- package/out/fst-builder.d.ts +20 -0
- package/out/fst-builder.d.ts.map +1 -0
- package/out/fst-builder.js +219 -0
- package/out/fst-builder.js.map +1 -0
- package/out/fst-deserialize-web.d.ts +16 -0
- package/out/fst-deserialize-web.d.ts.map +1 -0
- package/out/fst-deserialize-web.js +133 -0
- package/out/fst-deserialize-web.js.map +1 -0
- package/out/fst-matcher.d.ts +33 -0
- package/out/fst-matcher.d.ts.map +1 -0
- package/out/fst-matcher.js +117 -0
- package/out/fst-matcher.js.map +1 -0
- package/out/fst-serialize.d.ts +30 -0
- package/out/fst-serialize.d.ts.map +1 -0
- package/out/fst-serialize.js +261 -0
- package/out/fst-serialize.js.map +1 -0
- package/out/fst-types.d.ts +60 -0
- package/out/fst-types.d.ts.map +1 -0
- package/out/fst-types.js +11 -0
- package/out/fst-types.js.map +1 -0
- package/out/fts.d.ts +158 -0
- package/out/fts.d.ts.map +1 -0
- package/out/fts.js +261 -0
- package/out/fts.js.map +1 -0
- package/out/geo.d.ts +74 -0
- package/out/geo.d.ts.map +1 -0
- package/out/geo.js +88 -0
- package/out/geo.js.map +1 -0
- package/out/index.d.ts +27 -0
- package/out/index.d.ts.map +1 -0
- package/out/index.js +22 -0
- package/out/index.js.map +1 -0
- package/out/interpolation.d.ts +84 -0
- package/out/interpolation.d.ts.map +1 -0
- package/out/interpolation.js +150 -0
- package/out/interpolation.js.map +1 -0
- package/out/lookup.d.ts +156 -0
- package/out/lookup.d.ts.map +1 -0
- package/out/lookup.js +876 -0
- package/out/lookup.js.map +1 -0
- package/out/postal-city-alias-lookup.d.ts +50 -0
- package/out/postal-city-alias-lookup.d.ts.map +1 -0
- package/out/postal-city-alias-lookup.js +66 -0
- package/out/postal-city-alias-lookup.js.map +1 -0
- package/out/postal-city-alias-schema.d.ts +51 -0
- package/out/postal-city-alias-schema.d.ts.map +1 -0
- package/out/postal-city-alias-schema.js +47 -0
- package/out/postal-city-alias-schema.js.map +1 -0
- package/out/postal-city-candidate-schema.d.ts +58 -0
- package/out/postal-city-candidate-schema.d.ts.map +1 -0
- package/out/postal-city-candidate-schema.js +56 -0
- package/out/postal-city-candidate-schema.js.map +1 -0
- package/out/postcode-point-lookup.d.ts +38 -0
- package/out/postcode-point-lookup.d.ts.map +1 -0
- package/out/postcode-point-lookup.js +46 -0
- package/out/postcode-point-lookup.js.map +1 -0
- package/out/reverse.d.ts +99 -0
- package/out/reverse.d.ts.map +1 -0
- package/out/reverse.js +290 -0
- package/out/reverse.js.map +1 -0
- package/out/schema.d.ts +163 -0
- package/out/schema.d.ts.map +1 -0
- package/out/schema.js +18 -0
- package/out/schema.js.map +1 -0
- package/out/sharding.d.ts +96 -0
- package/out/sharding.d.ts.map +1 -0
- package/out/sharding.js +129 -0
- package/out/sharding.js.map +1 -0
- package/out/sqlite-convention-source.d.ts +29 -0
- package/out/sqlite-convention-source.d.ts.map +1 -0
- package/out/sqlite-convention-source.js +53 -0
- package/out/sqlite-convention-source.js.map +1 -0
- package/out/sqlite-utils.d.ts +17 -0
- package/out/sqlite-utils.d.ts.map +1 -0
- package/out/sqlite-utils.js +24 -0
- package/out/sqlite-utils.js.map +1 -0
- package/out/street-morphology-fst-builder.d.ts +59 -0
- package/out/street-morphology-fst-builder.d.ts.map +1 -0
- package/out/street-morphology-fst-builder.js +174 -0
- package/out/street-morphology-fst-builder.js.map +1 -0
- package/out/street-normalize.d.ts +66 -0
- package/out/street-normalize.d.ts.map +1 -0
- package/out/street-normalize.js +176 -0
- package/out/street-normalize.js.map +1 -0
- package/out/street-segment-schema.d.ts +61 -0
- package/out/street-segment-schema.d.ts.map +1 -0
- package/out/street-segment-schema.js +64 -0
- package/out/street-segment-schema.js.map +1 -0
- package/out/types.d.ts +137 -0
- package/out/types.d.ts.map +1 -0
- package/out/types.js +13 -0
- package/out/types.js.map +1 -0
- package/out/unified-schema.d.ts +25 -0
- package/out/unified-schema.d.ts.map +1 -0
- package/out/unified-schema.js +142 -0
- package/out/unified-schema.js.map +1 -0
- package/package.json +54 -0
package/out/sharding.js
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Multi-shard support for `WofSqlitePlaceLookup` — opens multiple WOF SQLite distributions on one
|
|
7
|
+
* connection via `ATTACH DATABASE`, and routes queries to the right shard based on placetype.
|
|
8
|
+
*
|
|
9
|
+
* ## The FTS5 syntax rule that drove this design
|
|
10
|
+
*
|
|
11
|
+
* The naive `SELECT … FROM pc.place_search WHERE pc.place_search MATCH ?` fails — SQLite parses the
|
|
12
|
+
* schema-qualified table on the left of MATCH as "column place_search of table pc". Discovered in
|
|
13
|
+
* the spike at PR review time; documented as `_SHARD_RULE.md` should it ever bite again.
|
|
14
|
+
*
|
|
15
|
+
* The working form: schema-qualified in FROM, bare table name in MATCH:
|
|
16
|
+
*
|
|
17
|
+
* ```sql
|
|
18
|
+
* SELECT … FROM pc.place_search WHERE place_search MATCH ?
|
|
19
|
+
* ```
|
|
20
|
+
*
|
|
21
|
+
* Identical table names across attached shards (which is what we have — every shard ships its own
|
|
22
|
+
* `place_search` + `place_bbox`) are fine because the bare-name MATCH resolves against FROM
|
|
23
|
+
* scope.
|
|
24
|
+
*/
|
|
25
|
+
import { basename } from "node:path";
|
|
26
|
+
/**
 * Compute the schema handle used in `ATTACH DATABASE … AS <name>` from a WOF distribution path.
 *
 * Only the filename is considered. The steps, in order:
 *
 * 1. Drop a leading `whosonfirst-data-`.
 * 2. Drop a trailing `-latest.db`.
 * 3. Drop a trailing `.db` (covers files without the `-latest` marker).
 * 4. Replace every character outside `[A-Za-z0-9_]` with `_`.
 *
 * Examples:
 *
 * - `whosonfirst-data-admin-us-latest.db` → `admin_us`
 * - `whosonfirst-data-postalcode-us-latest.db` → `postalcode_us`
 * - `whosonfirst-data-admin-latest.db` → `admin`
 * - `my-custom.db` → `my_custom`
 *
 * The result is not guaranteed to start with a letter or underscore (a filename beginning with a
 * digit keeps its digit). Callers can bypass derivation by supplying `ShardConfig.schemaName`.
 *
 * @param path Path (or bare filename) of the SQLite distribution.
 * @returns The derived, non-empty schema name.
 * @throws {Error} When nothing is left of the filename after stripping (e.g. `.db`).
 */
export function deriveSchemaName(path) {
    // Directories never contribute to the handle.
    const fileName = basename(path);
    const withoutPrefix = fileName.replace(/^whosonfirst-data-/u, "");
    const withoutLatest = withoutPrefix.replace(/-latest\.db$/u, "");
    const withoutExtension = withoutLatest.replace(/\.db$/u, "");
    const schemaName = withoutExtension.replace(/[^a-zA-Z0-9_]/g, "_");
    if (schemaName.length > 0) {
        return schemaName;
    }
    throw new Error(`deriveSchemaName: could not derive a SQL schema name from path ${JSON.stringify(path)}`);
}
|
|
54
|
+
/** SQLite identifier regex — `[A-Za-z_][A-Za-z0-9_]*`. */
|
|
55
|
+
const SQLITE_IDENT_RE = /^[A-Za-z_][A-Za-z0-9_]*$/u;
|
|
56
|
+
/**
 * Normalize the user-provided `databasePath` option into a uniform list of resolved shards.
 *
 * Accepted inputs: a single path string, an array of path strings, or an array of config objects
 * (`{ path, schemaName?, placetypes? }`); strings and config objects may be mixed in one array.
 *
 * The first shard is always reported as `main` — that is the SQLite convention for the database a
 * connection is opened on. Its derived (or explicit) name is still validated but otherwise unused.
 * Every later shard keeps its derived (or explicit) schema name.
 *
 * SQLite matches schema names case-insensitively and reserves both `main` and `temp`, so collisions
 * are detected on the lower-cased name: `Main`, `TEMP`, or `Admin_US` next to `admin_us` are
 * rejected here instead of failing later at `ATTACH DATABASE`.
 *
 * @param input A path, a list of paths, or a list of shard configs.
 * @returns One `{ path, schemaName, placetypes }` record per input entry, in input order.
 *   `placetypes` defaults to an empty array.
 * @throws {Error} When the list is empty, when a schema name is not a plain SQLite identifier, or
 *   when a non-first shard's schema name collides with a reserved schema or an earlier shard.
 */
export function resolveShards(input) {
    const list = typeof input === "string" ? [input] : input;
    if (list.length === 0)
        throw new Error("resolveShards: at least one shard is required");
    // Lower-cased schema names already claimed; SQLite compares schema names case-insensitively.
    const seen = new Set();
    const out = [];
    for (let i = 0; i < list.length; i++) {
        const entry = list[i];
        const cfg = typeof entry === "string" ? { path: entry } : entry;
        const derived = cfg.schemaName ?? deriveSchemaName(cfg.path);
        if (!SQLITE_IDENT_RE.test(derived)) {
            throw new Error(`resolveShards: schema name ${JSON.stringify(derived)} is not a valid SQLite identifier ` +
                `(derived from path ${JSON.stringify(cfg.path)}). Pass an explicit ` +
                `{ path, schemaName } to override.`);
        }
        // The first shard is always main per SQLite semantics — its derived name is informational
        // only. Subsequent shards must have unique, non-reserved names.
        const schemaName = i === 0 ? "main" : derived;
        const key = schemaName.toLowerCase();
        const isReserved = key === "main" || key === "temp";
        if (i > 0 && (isReserved || seen.has(key))) {
            throw new Error(`resolveShards: schema name ${JSON.stringify(schemaName)} collides ` +
                `(either with the reserved "main"/"temp" schemas or another shard). ` +
                `Pass an explicit { path, schemaName }.`);
        }
        seen.add(key);
        out.push({
            path: cfg.path,
            // Keep the caller's casing in the output; only the collision check is case-folded.
            schemaName,
            placetypes: cfg.placetypes ?? [],
        });
    }
    return out;
}
|
|
94
|
+
/**
 * Choose which shard a query for `placetype` should run against.
 *
 * Precedence:
 *
 * 1. No placetype requested → the first shard.
 * 2. The first shard (in list order) whose explicit `placetypes` contains the requested placetype.
 * 3. The first non-`main` shard whose `schemaName` equals the placetype, starts with
 *    `<placetype>_`, or ends with `_<placetype>` (so `postalcode_us` serves `postalcode`, while
 *    `arboregion` does not serve `region`).
 * 4. Otherwise the first shard.
 *
 * Results are never merged across shards — BM25 scores aren't comparable across separately-indexed
 * corpora, and the typical mailwoman query has a single placetype anyway. A caller that needs
 * cross-shard results can issue two `findPlace` calls.
 *
 * @param shards Resolved shards; the first entry is the fallback.
 * @param placetype Requested placetype, or a falsy value for "no preference".
 * @returns The selected shard (`undefined` only when `shards` is empty).
 */
export function pickShardForPlacetype(shards, placetype) {
    const fallback = shards[0];
    if (!placetype) {
        return fallback;
    }
    // Explicit placetype lists win over anything inferred from names.
    const declared = shards.find((shard) => shard.placetypes.includes(placetype));
    if (declared !== undefined) {
        return declared;
    }
    // Name-based inference: the placetype must be the whole name or a leading/trailing
    // underscore-delimited segment. `main` is never matched by name.
    const asPrefix = `${placetype}_`;
    const asSuffix = `_${placetype}`;
    const inferred = shards.find((shard) => {
        const name = shard.schemaName;
        if (name === "main") {
            return false;
        }
        return name === placetype || name.startsWith(asPrefix) || name.endsWith(asSuffix);
    });
    return inferred !== undefined ? inferred : fallback;
}
|
|
129
|
+
//# sourceMappingURL=sharding.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sharding.js","sourceRoot":"","sources":["../sharding.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAA;AAEpC;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC5C,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC;SACzB,OAAO,CAAC,qBAAqB,EAAE,EAAE,CAAC;SAClC,OAAO,CAAC,eAAe,EAAE,EAAE,CAAC;SAC5B,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;SACrB,OAAO,CAAC,gBAAgB,EAAE,GAAG,CAAC,CAAA;IAChC,IAAI,CAAC,IAAI,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CAAC,kEAAkE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IAC1G,CAAC;IACD,OAAO,IAAI,CAAA;AACZ,CAAC;AAkCD,0DAA0D;AAC1D,MAAM,eAAe,GAAG,2BAA2B,CAAA;AAEnD;;;;;;GAMG;AACH,MAAM,UAAU,aAAa,CAAC,KAAmD;IAChF,MAAM,IAAI,GAAG,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAA;IACxD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,MAAM,IAAI,KAAK,CAAC,+CAA+C,CAAC,CAAA;IAEvF,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAA;IAC9B,MAAM,GAAG,GAAoB,EAAE,CAAA;IAC/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,IAAI,CAAC,CAAC,CAAE,CAAA;QACtB,MAAM,GAAG,GAAgB,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,CAAA;QAC5E,MAAM,OAAO,GAAG,GAAG,CAAC,UAAU,IAAI,gBAAgB,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QAC5D,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;YACpC,MAAM,IAAI,KAAK,CACd,8BAA8B,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,oCAAoC;gBACxF,sBAAsB,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,sBAAsB;gBACpE,mCAAmC,CACpC,CAAA;QACF,CAAC;QACD,0FAA0F;QAC1F,2DAA2D;QAC3D,MAAM,UAAU,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAA;QAC7C,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,UAAU,KAAK,MAAM,IAAI,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC,EAAE,CAAC;YAC9D,MAAM,IAAI,KAAK,CACd,8BAA8B,IAAI,CAAC,SAAS,CAAC,UAAU,CAAC,YAAY;gBACnE,+EAA+E,CAChF,CAAA;QACF,CAAC;QACD,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,CAAA;QACpB,GAAG,CAAC,IAAI,CAAC;YACR,IAAI,EAAE,GAAG,CAAC,IAAI;YACd,UAAU;YACV,UAAU,EAAE,GAAG,CAAC,UAAU,IAAI,EAAE;SAChC,CAAC,CAAA;IACH,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;;;;;;;;;;GAaG;AACH,MAAM,
UAAU,qBAAqB,CAAC,MAAuB,EAAE,SAA6B;IAC3F,IAAI,CAAC,SAAS;QAAE,OAAO,MAAM,CAAC,CAAC,CAAE,CAAA;IACjC,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACxB,IAAI,CAAC,CAAC,UAAU,CAAC,QAAQ,CAAC,SAAS,CAAC;YAAE,OAAO,CAAC,CAAA;IAC/C,CAAC;IACD,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACxB,IAAI,CAAC,CAAC,UAAU,KAAK,MAAM;YAAE,SAAQ;QACrC,qFAAqF;QACrF,qFAAqF;QACrF,kCAAkC;QAClC,IACC,CAAC,CAAC,UAAU,KAAK,SAAS;YAC1B,CAAC,CAAC,UAAU,CAAC,UAAU,CAAC,GAAG,SAAS,GAAG,CAAC;YACxC,CAAC,CAAC,UAAU,CAAC,QAAQ,CAAC,IAAI,SAAS,EAAE,CAAC,EACrC,CAAC;YACF,OAAO,CAAC,CAAA;QACT,CAAC;IACF,CAAC;IACD,OAAO,MAAM,CAAC,CAAC,CAAE,CAAA;AAClB,CAAC"}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `SqliteConventionSource` — a `ConventionSource` backed by the build-from-source convention asset
|
|
7
|
+
* (#290, Direction E). Conventions live in a read-only, provenance-stamped `address_convention`
|
|
8
|
+
* table keyed by WOF polygon id; this source queries them ON DEMAND by id (one indexed lookup,
|
|
9
|
+
* memoized) rather than paging the whole table into memory as a code constant — the deliberate
|
|
10
|
+
* counter to the Pelias "giant dictionary in RAM, no provenance" pattern (see the operator design
|
|
11
|
+
* value in memory `feedback-no-load-bearing-trivia`).
|
|
12
|
+
*
|
|
13
|
+
* The asset is the queryable, distributable artifact; the strategy IMPLEMENTATIONS stay in code. An
|
|
14
|
+
* unknown strategy NAME is surfaced loudly at dispatch (see `lookup.ts`), not silently
|
|
15
|
+
* swallowed.
|
|
16
|
+
*/
|
|
17
|
+
import type { DatabaseSync } from "node:sqlite";
|
|
18
|
+
import { type Convention, type ConventionSource } from "./convention.js";
|
|
19
|
+
export declare class SqliteConventionSource implements ConventionSource {
|
|
20
|
+
#private;
|
|
21
|
+
/**
|
|
22
|
+
* @param db An open handle to a DB that either has the convention asset attached or is the asset itself.
|
|
23
|
+
* @param schema The schema name the `address_convention` table lives under (`main` or an ATTACHed
|
|
24
|
+
* shard name — `WofSqlitePlaceLookup` auto-detects which shard carries the table).
|
|
25
|
+
*/
|
|
26
|
+
constructor(db: DatabaseSync, schema: string);
|
|
27
|
+
get(wofId: number): Convention | undefined;
|
|
28
|
+
}
|
|
29
|
+
//# sourceMappingURL=sqlite-convention-source.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sqlite-convention-source.d.ts","sourceRoot":"","sources":["../sqlite-convention-source.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAA;AAE/C,OAAO,EAA4B,KAAK,UAAU,EAAE,KAAK,gBAAgB,EAAE,MAAM,iBAAiB,CAAA;AAElG,qBAAa,sBAAuB,YAAW,gBAAgB;;IAM9D;;;;OAIG;gBACS,EAAE,EAAE,YAAY,EAAE,MAAM,EAAE,MAAM;IAK5C,GAAG,CAAC,KAAK,EAAE,MAAM,GAAG,UAAU,GAAG,SAAS;CAiB1C"}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `SqliteConventionSource` — a `ConventionSource` backed by the build-from-source convention asset
|
|
7
|
+
* (#290, Direction E). Conventions live in a read-only, provenance-stamped `address_convention`
|
|
8
|
+
* table keyed by WOF polygon id; this source queries them ON DEMAND by id (one indexed lookup,
|
|
9
|
+
* memoized) rather than paging the whole table into memory as a code constant — the deliberate
|
|
10
|
+
* counter to the Pelias "giant dictionary in RAM, no provenance" pattern (see the operator design
|
|
11
|
+
* value in memory `feedback-no-load-bearing-trivia`).
|
|
12
|
+
*
|
|
13
|
+
* The asset is the queryable, distributable artifact; the strategy IMPLEMENTATIONS stay in code. An
|
|
14
|
+
* unknown strategy NAME is surfaced loudly at dispatch (see `lookup.ts`), not silently
|
|
15
|
+
* swallowed.
|
|
16
|
+
*/
|
|
17
|
+
import { ADDRESS_CONVENTION_TABLE } from "./convention.js";
|
|
18
|
+
export class SqliteConventionSource {
    #db;
    #schema;
    /** Memoize per-id lookups (including misses, as `null`) so a hot ancestor chain is queried once. */
    #cache = new Map();
    /**
     * Lookup statement, prepared on first use and reused for every later cache miss. Reset to
     * `undefined` whenever a lookup fails so the next miss prepares a fresh one.
     */
    #statement = undefined;
    /**
     * @param db An open handle to a DB that either has the convention asset attached or is the
     *   asset itself.
     * @param schema The schema name the `address_convention` table lives under (`main` or an ATTACHed
     *   shard name — `WofSqlitePlaceLookup` auto-detects which shard carries the table). The name is
     *   interpolated into the SQL text verbatim, so it must be a trusted, plain identifier.
     */
    constructor(db, schema) {
        this.#db = db;
        this.#schema = schema;
    }
    /**
     * Look up the convention stored for a WOF id.
     *
     * The `convention` column is read as JSON text and parsed. Results — including misses — are
     * memoized per id, so each id reaches the database at most once.
     *
     * @param wofId WOF id to look up (matched against the `wof_id` column).
     * @returns The parsed convention, or `undefined` when there is no row, the column is empty, or
     *   the lookup/parse failed.
     */
    get(wofId) {
        const cached = this.#cache.get(wofId);
        if (cached !== undefined)
            return cached ?? undefined;
        let value = null;
        try {
            // Prepare once: the SQL text never changes for the lifetime of this source.
            if (this.#statement === undefined) {
                this.#statement = this.#db.prepare(`SELECT convention FROM ${this.#schema}.${ADDRESS_CONVENTION_TABLE} WHERE wof_id = ?`);
            }
            const row = this.#statement.get(wofId);
            if (row?.convention)
                value = JSON.parse(row.convention);
        }
        catch {
            // Malformed JSON or a missing table → treat as no override (the chain falls back to
            // WORLD_DEFAULT). The build script validates structure, so this is purely defensive.
            value = null;
            // Drop the statement so a later miss re-prepares instead of reusing a handle that may
            // no longer be usable.
            this.#statement = undefined;
        }
        this.#cache.set(wofId, value);
        return value ?? undefined;
    }
}
|
|
53
|
+
//# sourceMappingURL=sqlite-convention-source.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sqlite-convention-source.js","sourceRoot":"","sources":["../sqlite-convention-source.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAIH,OAAO,EAAE,wBAAwB,EAA0C,MAAM,iBAAiB,CAAA;AAElG,MAAM,OAAO,sBAAsB;IACzB,GAAG,CAAc;IACjB,OAAO,CAAQ;IACxB,oGAAoG;IAC3F,MAAM,GAAG,IAAI,GAAG,EAA6B,CAAA;IAEtD;;;;OAIG;IACH,YAAY,EAAgB,EAAE,MAAc;QAC3C,IAAI,CAAC,GAAG,GAAG,EAAE,CAAA;QACb,IAAI,CAAC,OAAO,GAAG,MAAM,CAAA;IACtB,CAAC;IAED,GAAG,CAAC,KAAa;QAChB,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,CAAA;QACrC,IAAI,MAAM,KAAK,SAAS;YAAE,OAAO,MAAM,IAAI,SAAS,CAAA;QACpD,IAAI,KAAK,GAAsB,IAAI,CAAA;QACnC,IAAI,CAAC;YACJ,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG;iBAClB,OAAO,CAAC,0BAA0B,IAAI,CAAC,OAAO,IAAI,wBAAwB,mBAAmB,CAAC;iBAC9F,GAAG,CAAC,KAAK,CAAuC,CAAA;YAClD,IAAI,GAAG,EAAE,UAAU;gBAAE,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,UAAU,CAAe,CAAA;QACtE,CAAC;QAAC,MAAM,CAAC;YACR,oFAAoF;YACpF,qFAAqF;YACrF,KAAK,GAAG,IAAI,CAAA;QACb,CAAC;QACD,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,CAAA;QAC7B,OAAO,KAAK,IAAI,SAAS,CAAA;IAC1B,CAAC;CACD"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Small shared helpers for the SQLite-backed lookups.
|
|
7
|
+
*/
|
|
8
|
+
import type { DatabaseSync } from "node:sqlite";
|
|
9
|
+
/**
|
|
10
|
+
* True when `name` is a table in the open database. The street-level lookups use this to degrade
|
|
11
|
+
* gracefully on an empty/tableless shard — an interrupted `build-*-shard.ts`, or a stray 0-byte
|
|
12
|
+
* file (e.g. `sqlite3 <missing>.db "…"` CREATES one) — rather than throwing `no such table` at
|
|
13
|
+
* construction and taking down a whole state's geocode (#568). A missing table makes the lookup a
|
|
14
|
+
* no-op miss.
|
|
15
|
+
*/
|
|
16
|
+
export declare function hasTable(db: DatabaseSync, name: string): boolean;
|
|
17
|
+
//# sourceMappingURL=sqlite-utils.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sqlite-utils.d.ts","sourceRoot":"","sources":["../sqlite-utils.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAA;AAE/C;;;;;;GAMG;AACH,wBAAgB,QAAQ,CAAC,EAAE,EAAE,YAAY,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,CAOhE"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Small shared helpers for the SQLite-backed lookups.
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Report whether the open database has a table called `name`.
 *
 * The check is a single query against `sqlite_master` restricted to `type = 'table'`. Any error
 * raised while preparing or running that query is treated as "no such table".
 *
 * The street-level lookups rely on this to degrade gracefully on an empty/tableless shard — an
 * interrupted `build-*-shard.ts`, or a stray 0-byte file (e.g. `sqlite3 <missing>.db "…"` CREATES
 * one) — rather than throwing `no such table` at construction and taking down a whole state's
 * geocode (#568). A missing table makes the lookup a no-op miss.
 *
 * @param db Open database handle.
 * @param name Table name to look for.
 * @returns `true` when a matching row exists, `false` otherwise (including on error).
 */
export function hasTable(db, name) {
    let found = false;
    try {
        const probe = db.prepare("SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ? LIMIT 1");
        found = probe.get(name) !== undefined;
    }
    catch {
        found = false;
    }
    return found;
}
|
|
24
|
+
//# sourceMappingURL=sqlite-utils.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sqlite-utils.js","sourceRoot":"","sources":["../sqlite-utils.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAIH;;;;;;GAMG;AACH,MAAM,UAAU,QAAQ,CAAC,EAAgB,EAAE,IAAY;IACtD,IAAI,CAAC;QACJ,MAAM,GAAG,GAAG,EAAE,CAAC,OAAO,CAAC,uEAAuE,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACzG,OAAO,GAAG,KAAK,SAAS,CAAA;IACzB,CAAC;IAAC,MAAM,CAAC;QACR,OAAO,KAAK,CAAA;IACb,CAAC;AACF,CAAC"}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Build a street-morphology FST from libpostal's street_types dictionaries. The morphology FST maps
|
|
7
|
+
* street-typing affixes (Street/Avenue/rue/Calle/Straße/...) to a single synthetic placetype
|
|
8
|
+
* `"street_affix"` — distinct from the admin FST in source data, intent, and binary artifact.
|
|
9
|
+
*
|
|
10
|
+
* The morphology FST closes the inference-time vacuum identified by the v0.6.1 postmortem: street
|
|
11
|
+
* tokens have no admin-FST anchor, so synth-street training pushed the model toward over-emitting
|
|
12
|
+
* `dependent_locality` on subcomponents. With the morphology FST, the neural decoder gets
|
|
13
|
+
* positive evidence for street-typing affixes and the adjacent name tokens, plus negative
|
|
14
|
+
* evidence away from `dependent_locality` on the same neighbours.
|
|
15
|
+
*
|
|
16
|
+
* Design rationale + the four-layer street-supplement architecture lives in
|
|
17
|
+
* `docs/articles/concepts/street-supplement-architecture.md`.
|
|
18
|
+
*
|
|
19
|
+
* Source: `core/data/libpostal/dictionaries/{locale}/street_types.txt`. Each line is pipe-delimited
|
|
20
|
+
* surface forms with the canonical form first: avenue|av|ave|aven|avenu|avn|avnu|avnue
|
|
21
|
+
*
|
|
22
|
+
* Output: an `FstMatcher` ready to serialize via `serializeFst` to e.g.
|
|
23
|
+
* `fst-street-morphology.bin`.
|
|
24
|
+
*/
|
|
25
|
+
import { FstMatcher } from "./fst-matcher.js";
|
|
26
|
+
import type { FstProvenance } from "./fst-types.js";
|
|
27
|
+
export interface BuildStreetMorphologyFstOpts {
|
|
28
|
+
/** Path to the `core/data/libpostal/dictionaries` directory containing per-locale subfolders. */
|
|
29
|
+
dictionariesDir: string;
|
|
30
|
+
/**
|
|
31
|
+
* Optional locale filter — only ingest these locale subfolders. Defaults to all that have a
|
|
32
|
+
* `street_types.txt`.
|
|
33
|
+
*/
|
|
34
|
+
locales?: string[];
|
|
35
|
+
/**
|
|
36
|
+
* Minimum length (in characters, post-normalization) of variant surface forms to insert into the
|
|
37
|
+
* trie. Defaults to 3.
|
|
38
|
+
*
|
|
39
|
+
* Rationale: libpostal's street_types dictionaries contain 1-2 character abbreviations (`a`, `b`,
|
|
40
|
+
* `av`, `bd`, `br`, ...) that collide with non-affix tokens at parse time — notably US state
|
|
41
|
+
* abbreviations (`OR`, `CA`, `ND`, `NY`), single-letter unit designators, and arbitrary short
|
|
42
|
+
* tokens. Empirically these collisions push the morphology prior to mis-tag state abbreviations
|
|
43
|
+
* as `street_suffix`. A minimum length of 3 retains useful forms (`ave`, `blvd`, `rue`, `str`)
|
|
44
|
+
* while filtering out the noise.
|
|
45
|
+
*/
|
|
46
|
+
minVariantLength?: number;
|
|
47
|
+
/** Optional progress callback. */
|
|
48
|
+
onProgress?: (phase: string, detail?: string) => void;
|
|
49
|
+
}
|
|
50
|
+
export interface BuildStreetMorphologyFstResult {
|
|
51
|
+
matcher: FstMatcher;
|
|
52
|
+
provenance: FstProvenance;
|
|
53
|
+
canonicalCount: number;
|
|
54
|
+
variantCount: number;
|
|
55
|
+
insertCount: number;
|
|
56
|
+
locales: string[];
|
|
57
|
+
}
|
|
58
|
+
export declare function buildStreetMorphologyFst(opts: BuildStreetMorphologyFstOpts): BuildStreetMorphologyFstResult;
|
|
59
|
+
//# sourceMappingURL=street-morphology-fst-builder.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"street-morphology-fst-builder.d.ts","sourceRoot":"","sources":["../street-morphology-fst-builder.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAKH,OAAO,EAAE,UAAU,EAAmB,MAAM,kBAAkB,CAAA;AAC9D,OAAO,KAAK,EAAE,aAAa,EAAc,MAAM,gBAAgB,CAAA;AAW/D,MAAM,WAAW,4BAA4B;IAC5C,iGAAiG;IACjG,eAAe,EAAE,MAAM,CAAA;IACvB;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,EAAE,CAAA;IAClB;;;;;;;;;;OAUG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAA;IACzB,kCAAkC;IAClC,UAAU,CAAC,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,KAAK,IAAI,CAAA;CACrD;AAED,MAAM,WAAW,8BAA8B;IAC9C,OAAO,EAAE,UAAU,CAAA;IACnB,UAAU,EAAE,aAAa,CAAA;IACzB,cAAc,EAAE,MAAM,CAAA;IACtB,YAAY,EAAE,MAAM,CAAA;IACpB,WAAW,EAAE,MAAM,CAAA;IACnB,OAAO,EAAE,MAAM,EAAE,CAAA;CACjB;AAmBD,wBAAgB,wBAAwB,CAAC,IAAI,EAAE,4BAA4B,GAAG,8BAA8B,CAwH3G"}
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Build a street-morphology FST from libpostal's street_types dictionaries. The morphology FST maps
|
|
7
|
+
* street-typing affixes (Street/Avenue/rue/Calle/Straße/...) to a single synthetic placetype
|
|
8
|
+
* `"street_affix"` — distinct from the admin FST in source data, intent, and binary artifact.
|
|
9
|
+
*
|
|
10
|
+
* The morphology FST closes the inference-time vacuum identified by the v0.6.1 postmortem: street
|
|
11
|
+
* tokens have no admin-FST anchor, so synth-street training pushed the model toward over-emitting
|
|
12
|
+
* `dependent_locality` on subcomponents. With the morphology FST, the neural decoder gets
|
|
13
|
+
* positive evidence for street-typing affixes and the adjacent name tokens, plus negative
|
|
14
|
+
* evidence away from `dependent_locality` on the same neighbours.
|
|
15
|
+
*
|
|
16
|
+
* Design rationale + the four-layer street-supplement architecture lives in
|
|
17
|
+
* `docs/articles/concepts/street-supplement-architecture.md`.
|
|
18
|
+
*
|
|
19
|
+
* Source: `core/data/libpostal/dictionaries/{locale}/street_types.txt`. Each line is pipe-delimited
|
|
20
|
+
* surface forms with the canonical form first: avenue|av|ave|aven|avenu|avn|avnu|avnue
|
|
21
|
+
*
|
|
22
|
+
* Output: an `FstMatcher` ready to serialize via `serializeFst` to e.g.
|
|
23
|
+
* `fst-street-morphology.bin`.
|
|
24
|
+
*/
|
|
25
|
+
import { readdirSync, readFileSync, statSync } from "node:fs";
|
|
26
|
+
import { join } from "node:path";
|
|
27
|
+
import { FstMatcher, normalizeTokens } from "./fst-matcher.js";
|
|
28
|
+
/**
|
|
29
|
+
* Reserved synthetic wofID base for street-morphology entries. 32-bit unsigned, well above any
|
|
30
|
+
* realistic WOF allocation. Reusing the same base across rebuilds keeps IDs stable for any consumer
|
|
31
|
+
* that caches them. See [[project-schema-storage-decision]] for the reserved range policy.
|
|
32
|
+
*/
|
|
33
|
+
const STREET_AFFIX_WOFID_BASE = 1_900_000_000;
|
|
34
|
+
const STREET_TYPES_FILENAME = "street_types.txt";
|
|
35
|
+
/**
 * Split one `street_types.txt` line into its surface forms.
 *
 * The line is trimmed; blank lines and lines starting with `#` yield `null`. Otherwise the text is
 * split on `|`, each piece is trimmed, and empty pieces are dropped. The first remaining form is
 * the canonical one, and `variants` lists every remaining form in order (canonical included). A
 * line without `|` therefore produces a single-form entry where canonical == variant.
 *
 * @param line Raw line from the dictionary file.
 * @returns `{ canonical, variants }`, or `null` when the line carries no forms.
 */
function parseLine(line) {
    const text = line.trim();
    const isBlank = text.length === 0;
    if (isBlank || text.startsWith("#")) {
        return null;
    }
    const forms = [];
    for (const piece of text.split("|")) {
        const form = piece.trim();
        if (form.length > 0) {
            forms.push(form);
        }
    }
    // A line made only of separators (e.g. "||") has nothing to contribute.
    return forms.length === 0 ? null : { canonical: forms[0], variants: forms };
}
|
|
53
|
+
/**
 * Decide whether `entry` (a name inside `dictionariesDir`) is a locale directory that holds a
 * `street_types.txt` regular file.
 *
 * Both stat calls are guarded: an entry that cannot be stat'ed (dangling symlink, permission
 * error, removed mid-scan) is skipped instead of aborting discovery for every other locale.
 */
function hasStreetTypesFile(dictionariesDir, entry) {
    const localePath = join(dictionariesDir, entry);
    try {
        return (statSync(localePath).isDirectory() &&
            statSync(join(localePath, STREET_TYPES_FILENAME)).isFile());
    }
    catch {
        return false;
    }
}
/**
 * Build the street-morphology FST from per-locale `street_types.txt` dictionaries.
 *
 * Steps: discover locales, union the variants of each canonical form across locales, assign each
 * canonical a synthetic wofID (`STREET_AFFIX_WOFID_BASE` + index in sorted order), insert every
 * surviving variant into a token trie that terminates at its canonical's entry, then wrap the
 * trie with `FstMatcher.fromNodes`.
 *
 * @param opts Build options.
 * @param opts.dictionariesDir Directory containing one sub-directory per locale.
 * @param opts.locales Optional explicit locale list. When absent or empty, every sub-directory
 *   of `dictionariesDir` that contains a `street_types.txt` file is used.
 * @param opts.minVariantLength Minimum length of a variant's joined tokens; shorter variants are
 *   skipped. Defaults to 3.
 * @param opts.onProgress Optional `(stage, message)` callback; stages emitted are `discover`,
 *   `collect` and `trie`.
 * @returns `{ matcher, provenance, canonicalCount, variantCount, insertCount, locales }`.
 * @throws Propagates filesystem errors from `readdirSync` (unreadable `dictionariesDir`) and
 *   from `readFileSync` (e.g. an explicitly listed locale without a `street_types.txt`).
 */
export function buildStreetMorphologyFst(opts) {
    const progress = opts.onProgress ?? (() => { });
    const minVariantLength = opts.minVariantLength ?? 3;
    // Discover locales — either provided explicitly, or all directories containing street_types.txt.
    const locales = opts.locales && opts.locales.length > 0
        ? opts.locales
        : readdirSync(opts.dictionariesDir).filter((entry) => hasStreetTypesFile(opts.dictionariesDir, entry));
    progress("discover", `Found ${locales.length} locales with ${STREET_TYPES_FILENAME}`);
    // Collect canonical → set-of-variants across all locales. Same canonical form may appear in
    // multiple locales (e.g. "avenue" in en/fr); we union the variant sets.
    const canonicalToVariants = new Map();
    for (const locale of locales) {
        const filePath = join(opts.dictionariesDir, locale, STREET_TYPES_FILENAME);
        const content = readFileSync(filePath, "utf8");
        for (const line of content.split("\n")) {
            const parsed = parseLine(line);
            if (!parsed)
                continue;
            const existing = canonicalToVariants.get(parsed.canonical) ?? new Set();
            for (const variant of parsed.variants)
                existing.add(variant);
            canonicalToVariants.set(parsed.canonical, existing);
        }
    }
    progress("collect", `Collected ${canonicalToVariants.size} canonical affixes`);
    // Assign stable synthetic wofIDs. Sort canonicals for determinism.
    const sortedCanonicals = [...canonicalToVariants.keys()].sort();
    const canonicalToWofID = new Map();
    for (let i = 0; i < sortedCanonicals.length; i++) {
        canonicalToWofID.set(sortedCanonicals[i], STREET_AFFIX_WOFID_BASE + i);
    }
    // Build the trie. Each variant is inserted as a token sequence pointing to its canonical's
    // PlaceEntry — so all variants of "avenue" (av/ave/aven/...) lead to the same terminal entry.
    // State 0 is the root; a state's id is its index in `nodes`.
    const nodes = [{ edges: new Map(), places: [] }];
    function insertName(tokens, entry) {
        if (tokens.length === 0)
            return;
        let stateId = 0;
        for (const t of tokens) {
            const node = nodes[stateId];
            let next = node.edges.get(t);
            if (next === undefined) {
                // First time this token follows the current state: allocate a fresh state.
                next = nodes.length;
                nodes.push({ edges: new Map(), places: [] });
                node.edges.set(t, next);
            }
            stateId = next;
        }
        // A surface form shared by several canonicals accumulates one entry per canonical; the
        // same (wofID, placetype) pair is never stored twice on a state.
        const existing = nodes[stateId].places;
        if (!existing.some((p) => p.wofID === entry.wofID && p.placetype === entry.placetype)) {
            existing.push(entry);
        }
    }
    // Counts variants that survive normalization and the length filter. Each survivor triggers
    // exactly one `insertName` call, so this single counter backs both `insertCount` and
    // `variantCount` in the result.
    let insertCount = 0;
    for (const canonical of sortedCanonicals) {
        const variants = canonicalToVariants.get(canonical);
        const wofID = canonicalToWofID.get(canonical);
        const entry = {
            wofID,
            placetype: "street_affix",
            name: canonical,
            parentChain: [],
            // Fixed importance: street affixes are structurally unambiguous (Avenue is almost never
            // anything but street-typing). The morphology prior caps bias separately; this value
            // just feeds the cap formula `importance * cap`.
            importance: 1.0,
            lat: 0,
            lon: 0,
        };
        for (const variant of variants) {
            const tokens = normalizeTokens(variant);
            if (tokens.length === 0)
                continue;
            // Filter out collision-prone short surface forms — see `minVariantLength` docstring.
            // We measure against the joined token form (no spaces) since FST keys are token sequences.
            const joined = tokens.join("");
            if (joined.length < minVariantLength)
                continue;
            insertName(tokens, entry);
            insertCount++;
        }
    }
    progress("trie", `Built trie: ${nodes.length} states, ${insertCount} variant insertions`);
    const edgeCount = nodes.reduce((sum, n) => sum + n.edges.size, 0);
    const matcher = FstMatcher.fromNodes(nodes);
    const provenance = {
        builtAt: new Date().toISOString(),
        countries: locales, // Reuse `countries` slot for locale provenance — semantics differ from admin FST.
        stateCount: nodes.length,
        placeCount: sortedCanonicals.length,
        edgeCount,
        nameInsertions: insertCount,
        importanceMatches: 0, // No importance scoring for morphology — fixed at 1.0.
        sourceDb: opts.dictionariesDir,
    };
    return {
        matcher,
        provenance,
        canonicalCount: sortedCanonicals.length,
        variantCount: insertCount,
        insertCount,
        locales,
    };
}
|
|
174
|
+
//# sourceMappingURL=street-morphology-fst-builder.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"street-morphology-fst-builder.js","sourceRoot":"","sources":["../street-morphology-fst-builder.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,EAAE,WAAW,EAAE,YAAY,EAAE,QAAQ,EAAE,MAAM,SAAS,CAAA;AAC7D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAA;AAEhC,OAAO,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAA;AAG9D;;;;GAIG;AACH,MAAM,uBAAuB,GAAG,aAAa,CAAA;AAE7C,MAAM,qBAAqB,GAAG,kBAAkB,CAAA;AAmChD;;;;;GAKG;AACH,SAAS,SAAS,CAAC,IAAY;IAC9B,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAA;IAC3B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC;QAAE,OAAO,IAAI,CAAA;IAChE,MAAM,KAAK,GAAG,OAAO;SACnB,KAAK,CAAC,GAAG,CAAC;SACV,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;SACpB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAC7B,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAA;IACnC,OAAO,EAAE,SAAS,EAAE,KAAK,CAAC,CAAC,CAAE,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAA;AACjD,CAAC;AAED,MAAM,UAAU,wBAAwB,CAAC,IAAkC;IAC1E,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,IAAI,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAA;IAC9C,MAAM,gBAAgB,GAAG,IAAI,CAAC,gBAAgB,IAAI,CAAC,CAAA;IAEnD,iGAAiG;IACjG,IAAI,OAAiB,CAAA;IACrB,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7C,OAAO,GAAG,IAAI,CAAC,OAAO,CAAA;IACvB,CAAC;SAAM,CAAC;QACP,OAAO,GAAG,WAAW,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE;YAC5D,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,KAAK,CAAC,CAAA;YACpD,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,WAAW,EAAE;gBAAE,OAAO,KAAK,CAAA;YACrD,IAAI,CAAC;gBACJ,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,qBAAqB,CAAC,CAAC,CAAA;gBACjD,OAAO,IAAI,CAAA;YACZ,CAAC;YAAC,MAAM,CAAC;gBACR,OAAO,KAAK,CAAA;YACb,CAAC;QACF,CAAC,CAAC,CAAA;IACH,CAAC;IACD,QAAQ,CAAC,UAAU,EAAE,SAAS,OAAO,CAAC,MAAM,iBAAiB,qBAAqB,EAAE,CAAC,CAAA;IAErF,4FAA4F;IAC5F,wEAAwE;IACxE,MAAM,mBAAmB,GAAG,IAAI,GAAG,EAAuB,CAAA;IAC1D,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC9B,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,MAAM,EAAE,qBAAqB,CAAC,CAAA;QAC1E,MAAM,OAAO,GAAG,YAAY,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAA;QAC9C,KAAK,MAAM,IAAI,IA
AI,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;YACxC,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,CAAC,CAAA;YAC9B,IAAI,CAAC,MAAM;gBAAE,SAAQ;YACrB,MAAM,QAAQ,GAAG,mBAAmB,CAAC,GAAG,CAAC,MAAM,CAAC,SAAS,CAAC,IAAI,IAAI,GAAG,EAAU,CAAA;YAC/E,KAAK,MAAM,OAAO,IAAI,MAAM,CAAC,QAAQ;gBAAE,QAAQ,CAAC,GAAG,CAAC,OAAO,CAAC,CAAA;YAC5D,mBAAmB,CAAC,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAA;QACpD,CAAC;IACF,CAAC;IACD,QAAQ,CAAC,SAAS,EAAE,aAAa,mBAAmB,CAAC,IAAI,oBAAoB,CAAC,CAAA;IAE9E,mEAAmE;IACnE,MAAM,gBAAgB,GAAG,CAAC,GAAG,mBAAmB,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;IAC/D,MAAM,gBAAgB,GAAG,IAAI,GAAG,EAAkB,CAAA;IAClD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,gBAAgB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAClD,gBAAgB,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC,CAAE,EAAE,uBAAuB,GAAG,CAAC,CAAC,CAAA;IACxE,CAAC;IAED,2FAA2F;IAC3F,8FAA8F;IAC9F,MAAM,KAAK,GAAc,CAAC,EAAE,KAAK,EAAE,IAAI,GAAG,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,CAAC,CAAA;IAE3D,SAAS,UAAU,CAAC,MAAgB,EAAE,KAAiB;QACtD,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;YAAE,OAAM;QAC/B,IAAI,OAAO,GAAG,CAAC,CAAA;QACf,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACxB,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAE,CAAA;YAC5B,IAAI,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;YAC5B,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;gBACxB,IAAI,GAAG,KAAK,CAAC,MAAM,CAAA;gBACnB,KAAK,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,IAAI,GAAG,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,CAAC,CAAA;gBAC5C,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,CAAA;YACxB,CAAC;YACD,OAAO,GAAG,IAAI,CAAA;QACf,CAAC;QACD,MAAM,QAAQ,GAAG,KAAK,CAAC,OAAO,CAAE,CAAC,MAAM,CAAA;QACvC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,KAAK,KAAK,CAAC,KAAK,IAAI,CAAC,CAAC,SAAS,KAAK,KAAK,CAAC,SAAS,CAAC,EAAE,CAAC;YACvF,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;QACrB,CAAC;IACF,CAAC;IAED,IAAI,WAAW,GAAG,CAAC,CAAA;IACnB,IAAI,YAAY,GAAG,CAAC,CAAA;IACpB,KAAK,MAAM,SAAS,IAAI,gBAAgB,EAAE,CAAC;QAC1C,MAAM,QAAQ,GAAG,mBAAmB,CAAC,GAAG,CAAC,SAAS,CAAE,CAAA;QACpD,MAAM,KAAK,GAAG,gBAAgB,CAAC,GAAG,CAAC,SAAS,CAAE,CAAA;QAC9C,MAAM,KAAK,GAAe;YACzB,KAAK;YACL,SAAS,EAAE,cAAc;YACzB,IAAI,EAAE,SAAS;YACf,WAAW,EAAE,EAAE;YACf,w
FAAwF;YACxF,qFAAqF;YACrF,iDAAiD;YACjD,UAAU,EAAE,GAAG;YACf,GAAG,EAAE,CAAC;YACN,GAAG,EAAE,CAAC;SACN,CAAA;QACD,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAChC,MAAM,MAAM,GAAG,eAAe,CAAC,OAAO,CAAC,CAAA;YACvC,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;gBAAE,SAAQ;YACjC,qFAAqF;YACrF,2FAA2F;YAC3F,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;YAC9B,IAAI,MAAM,CAAC,MAAM,GAAG,gBAAgB;gBAAE,SAAQ;YAC9C,UAAU,CAAC,MAAM,EAAE,KAAK,CAAC,CAAA;YACzB,WAAW,EAAE,CAAA;YACb,YAAY,EAAE,CAAA;QACf,CAAC;IACF,CAAC;IACD,QAAQ,CAAC,MAAM,EAAE,eAAe,KAAK,CAAC,MAAM,YAAY,WAAW,qBAAqB,CAAC,CAAA;IAEzF,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC,CAAA;IACjE,MAAM,OAAO,GAAG,UAAU,CAAC,SAAS,CAAC,KAAK,CAAC,CAAA;IAC3C,MAAM,UAAU,GAAkB;QACjC,OAAO,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACjC,SAAS,EAAE,OAAO,EAAE,kFAAkF;QACtG,UAAU,EAAE,KAAK,CAAC,MAAM;QACxB,UAAU,EAAE,gBAAgB,CAAC,MAAM;QACnC,SAAS;QACT,cAAc,EAAE,WAAW;QAC3B,iBAAiB,EAAE,CAAC,EAAE,uDAAuD;QAC7E,QAAQ,EAAE,IAAI,CAAC,eAAe;KAC9B,CAAA;IAED,OAAO;QACN,OAAO;QACP,UAAU;QACV,cAAc,EAAE,gBAAgB,CAAC,MAAM;QACvC,YAAY;QACZ,WAAW;QACX,OAAO;KACP,CAAA;AACF,CAAC"}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* THE street normalizer for the address-point tier (#476). One function, used by BOTH the shard
|
|
7
|
+
* builder (`scripts/build-address-point-shard.ts`) and the lookup tier (`address-point.ts`) —
|
|
8
|
+
* never two implementations (the PLACETYPE_ORDER lesson: parallel copies silently corrupt).
|
|
9
|
+
*
|
|
10
|
+
* Normalization contract (deliberately aggressive — both sides apply the same function, so
|
|
11
|
+
* collisions only need to be _consistent_, not linguistically perfect):
|
|
12
|
+
*
|
|
13
|
+
* 1. Lowercase, NFKD-fold diacritics, collapse whitespace, strip punctuation (periods, commas,
|
|
14
|
+
* apostrophes).
|
|
15
|
+
* 2. Expand USPS directional abbreviations at the FIRST and LAST token position (`n` → `north`, `se` →
|
|
16
|
+
* `southeast`) — Overture sources abbreviate inconsistently.
|
|
17
|
+
* 3. Canonicalize a trailing USPS street-type token via the codex suffix table to its canonical full
|
|
18
|
+
* form (`st`/`str`/`street` → `street`).
|
|
19
|
+
*
|
|
20
|
+
* Numbered streets are left as digits (`5th` stays `5th`); a SPELLED ordinal before a street suffix
|
|
21
|
+
* folds to its digit form (`tenth street` → `10th street`, #723) so the grid-city ordinal
|
|
22
|
+
* cross-streets the source data spells with digits become reachable.
|
|
23
|
+
*/
|
|
24
|
+
/**
|
|
25
|
+
* Normalize a street name for address-point keying. Same function at build time and lookup time —
|
|
26
|
+
* see module docstring for the contract.
|
|
27
|
+
*/
|
|
28
|
+
export declare function normalizeStreetForKey(street: string): string;
|
|
29
|
+
/** Normalize a locality name for address-point keying (fold only — no street semantics). */
|
|
30
|
+
export declare function normalizeLocalityForKey(locality: string): string;
|
|
31
|
+
/**
|
|
32
|
+
* Strip a locality QUALIFIER for a query-side fallback — when an OA locality's exact normalized
|
|
33
|
+
* name misses the gazetteer's canonical name, retry with the qualifier removed. OA address data
|
|
34
|
+
* carries disambiguating qualifiers the gazetteer's canonical name omits: Austrian `Kraubath/Mur`
|
|
35
|
+
* and `Hart b.Graz` → `Hart`; Swiss `Lenk im Simmental` → `Lenk`, `Roche VD` → `Roche`; Danish
|
|
36
|
+
* `Odense S`, `Hurup Thy`. A FALLBACK ONLY — the exact name is tried first, and the region-bbox
|
|
37
|
+
* disambiguation resolves any base-name ambiguity downstream. The candidate table is unchanged
|
|
38
|
+
* (this is purely query-side); feed the result back through {@link normalizeLocalityForKey}. Returns
|
|
39
|
+
* "" when nothing was stripped (no point re-probing the identical key).
|
|
40
|
+
*
|
|
41
|
+
* Measured (`scripts/eval/candidate-recall.ts --strip-fallback`, EU OA holdouts): recovers AT
|
|
42
|
+
* 74.1→88.2% (+14.1pp), DK 91.5→96.2%, CH 90.4→92.6%; +1.3pp overall (diluted by the already-100%
|
|
43
|
+
* locales). Conservative by design — only the qualifier forms above; FI/PT/SI misses are
|
|
44
|
+
* untouched.
|
|
45
|
+
*/
|
|
46
|
+
export declare function stripLocalityQualifier(locality: string): string;
|
|
47
|
+
/**
|
|
48
|
+
* Fold numbered-route designators to a canonical key, applied AFTER {@link normalizeStreetForKey}.
|
|
49
|
+
* Sources disagree systematically on how they spell a route: TIGER says `State Rte 100` / `US Hwy
|
|
50
|
+
* 5` where E911/Overture say `VT ROUTE 100` / `US ROUTE 5` — the dominant street-name miss class in
|
|
51
|
+
* the #483 interpolation eval (rural addresses live on routes). `us <designator> N…` folds to `us
|
|
52
|
+
* route N…`; `state <designator> N…` and `<2-letter-prefix> <designator> N…` (the state
|
|
53
|
+
* abbreviation form) fold to `state route N…`. Only digit-leading route numbers fold — `State
|
|
54
|
+
* Street` and friends never match.
|
|
55
|
+
*
|
|
56
|
+
* Used by BOTH the segment-shard builder (`scripts/build-interpolation-shard.ts`) and the
|
|
57
|
+
* interpolation lookup — same one-function discipline as {@link normalizeStreetForKey}. The
|
|
58
|
+
* address-point tier (#476) does NOT apply it yet: adopting it there requires a shard rebuild
|
|
59
|
+
* (noted on #483).
|
|
60
|
+
*
|
|
61
|
+
* A same-numbered US and state route stay DISTINCT keys (`us route 5` vs `state route 5`); only the
|
|
62
|
+
* BARE `route N` form is ambiguous (designator unknown) and it stays unfolded — a bare-route query
|
|
63
|
+
* therefore misses rather than guessing a designator.
|
|
64
|
+
*/
|
|
65
|
+
export declare function canonicalizeRouteKey(streetNorm: string): string;
|
|
66
|
+
//# sourceMappingURL=street-normalize.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"street-normalize.d.ts","sourceRoot":"","sources":["../street-normalize.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAuDH;;;GAGG;AACH,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CA6C5D;AAED,4FAA4F;AAC5F,wBAAgB,uBAAuB,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAEhE;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,sBAAsB,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAQ/D;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,oBAAoB,CAAC,UAAU,EAAE,MAAM,GAAG,MAAM,CAI/D"}
|