@mailwoman/formatter 4.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/format.d.ts +72 -0
- package/out/format.d.ts.map +1 -0
- package/out/format.js +176 -0
- package/out/format.js.map +1 -0
- package/out/index.d.ts +15 -0
- package/out/index.d.ts.map +1 -0
- package/out/index.js +15 -0
- package/out/index.js.map +1 -0
- package/out/key.d.ts +41 -0
- package/out/key.d.ts.map +1 -0
- package/out/key.js +82 -0
- package/out/key.js.map +1 -0
- package/package.json +34 -0
package/out/format.d.ts
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Render a `ComponentTag`-keyed dict into a country-localized string — the inverse of the parser.
|
|
7
|
+
*
|
|
8
|
+
* This is the canonical home for Mailwoman's address formatting, consolidated from two earlier
|
|
9
|
+
* half-implementations: the `core/formatter` stub (which wrapped OpenCage but hardcoded `US`) and
|
|
10
|
+
* the corpus synthesis formatter (`corpus/src/format.ts`, the fuller one this is ported from).
|
|
11
|
+
*
|
|
12
|
+
* It bridges Mailwoman's `ComponentTag` schema to OpenCage's `address-formatting` templates
|
|
13
|
+
* (vendored via `@fragaria/address-formatter`, MIT) so callers get idiomatic per-country output
|
|
14
|
+
* without reinventing template logic. Owning our own templates — so we can express the slots
|
|
15
|
+
* OpenCage can't (`unit`, `intersection`, `cedex`, the JP tags) — is a deliberate follow-up; this
|
|
16
|
+
* first cut keeps Fragaria as the engine and concentrates the mapping in one place.
|
|
17
|
+
*
|
|
18
|
+
* Known limitations inherited from the OpenCage vocabulary (documented, not blockers):
|
|
19
|
+
*
|
|
20
|
+
* - `unit`: no slot, so units ride the road line (`"Pennsylvania Ave NW Apt 4B"`).
|
|
21
|
+
* - `intersection_a` / `intersection_b`: joined as `"<a> & <b>"` into the road field.
|
|
22
|
+
* - `cedex` (FR): folded into `postcode` (`"75008 CEDEX 08"`) so the FR template slots it right.
|
|
23
|
+
* - JP-specific tags (`prefecture`, `municipality`, …): no mapping yet.
|
|
24
|
+
*/
|
|
25
|
+
import type { ClassificationMap } from "@mailwoman/core/classification";
|
|
26
|
+
import type { ComponentTag } from "@mailwoman/core/types";
|
|
27
|
+
/** A partial map of `ComponentTag` → string value — the canonical formatter input. */
|
|
28
|
+
export type ComponentDict = Partial<Record<ComponentTag, string>>;
|
|
29
|
+
/** Options accepted by `formatAddress`. */
|
|
30
|
+
export interface FormatAddressOptions {
|
|
31
|
+
/**
|
|
32
|
+
* Append the country name as a final line (`"USA"`, `"France"`). Default `false`: most rows are
|
|
33
|
+
* intra-country and the country line is redundant noise.
|
|
34
|
+
*/
|
|
35
|
+
appendCountry?: boolean;
|
|
36
|
+
/**
|
|
37
|
+
* Apply OpenCage's per-country abbreviation rules (`"Avenue"` → `"Ave"`). Default `false` —
|
|
38
|
+
* callers that want abbreviation usually run it as their own augmentation pass.
|
|
39
|
+
*/
|
|
40
|
+
abbreviate?: boolean;
|
|
41
|
+
/**
|
|
42
|
+
* Replace the template's newlines with this separator. Default `undefined` (keep newlines). Use
|
|
43
|
+
* `", "` for comma-separated single-line output, or `" "` to join the lines with plain spaces (no added punctuation).
|
|
44
|
+
*/
|
|
45
|
+
separator?: string;
|
|
46
|
+
}
|
|
47
|
+
/**
|
|
48
|
+
* Render a component dict into an idiomatic per-country address string.
|
|
49
|
+
*
|
|
50
|
+
* Returns an empty string if `components` is empty after translation. Throws nothing — bad inputs
|
|
51
|
+
* degrade to the longest meaningful prefix.
|
|
52
|
+
*/
|
|
53
|
+
export declare function formatAddress(components: ComponentDict, country: string, opts?: FormatAddressOptions): string;
|
|
54
|
+
/**
|
|
55
|
+
* Format a legacy {@linkcode ClassificationMap} (`Map<VisibleClassification, string[]>`, as emitted
|
|
56
|
+
* by the rule-based pipeline) into an idiomatic address string. Subsumes the former
|
|
57
|
+
* `core/formatter` stub. Multi-span values are space-joined; unit-like labels are merged.
|
|
58
|
+
*/
|
|
59
|
+
export declare function formatFromClassificationMap(map: ClassificationMap, country: string, opts?: FormatAddressOptions): string;
|
|
60
|
+
/**
|
|
61
|
+
* Drop any component whose value isn't actually present in the formatted `raw`. OpenCage's
|
|
62
|
+
* per-country templates legitimately omit some inputs (FR regions absorbed by the postcode; US
|
|
63
|
+
* state names abbreviated), and downstream alignment requires `components[tag]` to occur in `raw`.
|
|
64
|
+
* Comparison is case- and whitespace-insensitive; the retained value is the original input.
|
|
65
|
+
*/
|
|
66
|
+
export declare function reconcileComponents(components: ComponentDict, raw: string): ComponentDict;
|
|
67
|
+
/**
|
|
68
|
+
* Translate a `ComponentTag` dict to the OpenCage vocabulary `@fragaria/address-formatter` expects.
|
|
69
|
+
* Exported for testing and for callers that pre-build the dict for batch formatting.
|
|
70
|
+
*/
|
|
71
|
+
export declare function toOpenCageComponents(components: ComponentDict, country: string): Record<string, string>;
|
|
72
|
+
//# sourceMappingURL=format.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"format.d.ts","sourceRoot":"","sources":["../format.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAGH,OAAO,KAAK,EAAE,iBAAiB,EAAyB,MAAM,gCAAgC,CAAA;AAC9F,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAA;AAEzD,sFAAsF;AACtF,MAAM,MAAM,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAA;AAEjE,2CAA2C;AAC3C,MAAM,WAAW,oBAAoB;IACpC;;;OAGG;IACH,aAAa,CAAC,EAAE,OAAO,CAAA;IAEvB;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,CAAA;IAEpB;;;OAGG;IACH,SAAS,CAAC,EAAE,MAAM,CAAA;CAClB;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,UAAU,EAAE,aAAa,EAAE,OAAO,EAAE,MAAM,EAAE,IAAI,GAAE,oBAAyB,GAAG,MAAM,CAWjH;AAoBD;;;;GAIG;AACH,wBAAgB,2BAA2B,CAC1C,GAAG,EAAE,iBAAiB,EACtB,OAAO,EAAE,MAAM,EACf,IAAI,GAAE,oBAAyB,GAC7B,MAAM,CAoBR;AAED;;;;;GAKG;AACH,wBAAgB,mBAAmB,CAAC,UAAU,EAAE,aAAa,EAAE,GAAG,EAAE,MAAM,GAAG,aAAa,CAWzF;AAED;;;GAGG;AACH,wBAAgB,oBAAoB,CAAC,UAAU,EAAE,aAAa,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CA4BvG"}
|
package/out/format.js
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Render a `ComponentTag`-keyed dict into a country-localized string — the inverse of the parser.
|
|
7
|
+
*
|
|
8
|
+
* This is the canonical home for Mailwoman's address formatting, consolidated from two earlier
|
|
9
|
+
* half-implementations: the `core/formatter` stub (which wrapped OpenCage but hardcoded `US`) and
|
|
10
|
+
* the corpus synthesis formatter (`corpus/src/format.ts`, the fuller one this is ported from).
|
|
11
|
+
*
|
|
12
|
+
* It bridges Mailwoman's `ComponentTag` schema to OpenCage's `address-formatting` templates
|
|
13
|
+
* (vendored via `@fragaria/address-formatter`, MIT) so callers get idiomatic per-country output
|
|
14
|
+
* without reinventing template logic. Owning our own templates — so we can express the slots
|
|
15
|
+
* OpenCage can't (`unit`, `intersection`, `cedex`, the JP tags) — is a deliberate follow-up; this
|
|
16
|
+
* first cut keeps Fragaria as the engine and concentrates the mapping in one place.
|
|
17
|
+
*
|
|
18
|
+
* Known limitations inherited from the OpenCage vocabulary (documented, not blockers):
|
|
19
|
+
*
|
|
20
|
+
* - `unit`: no slot, so units ride the road line (`"Pennsylvania Ave NW Apt 4B"`).
|
|
21
|
+
* - `intersection_a` / `intersection_b`: joined as `"<a> & <b>"` into the road field.
|
|
22
|
+
* - `cedex` (FR): folded into `postcode` (`"75008 CEDEX 08"`) so the FR template slots it right.
|
|
23
|
+
* - JP-specific tags (`prefecture`, `municipality`, …): no mapping yet.
|
|
24
|
+
*/
|
|
25
|
+
import addressFormatter from "@fragaria/address-formatter";
|
|
26
|
+
/**
 * Render a component dict into an idiomatic per-country address string.
 *
 * The components are first translated to the OpenCage vocabulary by `toOpenCageComponents`. If
 * nothing survives that translation the result is `""` and the template engine is not invoked.
 * Otherwise the rendered template has its trailing whitespace removed and, when `opts.separator`
 * is defined, every run of newlines is replaced by that separator.
 *
 * @param components - `ComponentTag`-keyed values to render.
 * @param country - Country code, forwarded to `toOpenCageComponents` for template selection.
 * @param opts - Optional `abbreviate` / `appendCountry` (both default to `false`) and `separator`.
 * @returns The formatted address, or `""` when there is nothing to render.
 */
export function formatAddress(components, country, opts = {}) {
    const ocComponents = toOpenCageComponents(components, country);
    if (Object.keys(ocComponents).length === 0) {
        return "";
    }
    const rendered = addressFormatter.format(ocComponents, {
        abbreviate: opts.abbreviate ?? false,
        appendCountry: opts.appendCountry ?? false,
    });
    // trimEnd() strips the same character set as /\s+$/ without regex backtracking.
    const trimmed = rendered.trimEnd();
    if (opts.separator === undefined) {
        return trimmed;
    }
    // split/join inserts the separator verbatim. Passing it to String.prototype.replace as a
    // replacement string would expand `$&`, `$1`, `$$`, … inside the caller's separator.
    return trimmed.split(/\n+/).join(opts.separator);
}
|
|
43
|
+
/**
 * Lookup table from legacy rule-classifier labels ({@linkcode VisibleClassification}) to the
 * canonical `ComponentTag` names, so a `ClassificationMap` can be fed to the same formatter as a
 * component dict. Labels without an entry here are ignored by `formatFromClassificationMap`.
 *
 * `unit` and `level` are intentionally absent: `formatFromClassificationMap` collects those two
 * labels itself and merges them into the single `unit` tag.
 *
 * NOTE(review): `unit_designator` / `level_designator` have no entry here and are not among the
 * labels merged into `unit`, so they appear to be dropped — confirm whether that is intended.
 */
const CLASSIFICATION_TO_TAG = {
    // administrative hierarchy
    country: "country",
    region: "region",
    locality: "locality",
    dependency: "dependent_locality",
    // postal code
    postcode: "postcode",
    // street-level detail
    house_number: "house_number",
    street: "street",
    venue: "venue",
};
|
|
60
|
+
/**
 * Format a legacy {@linkcode ClassificationMap} (label → list of spans, as emitted by the
 * rule-based pipeline) into an idiomatic address string.
 *
 * Each label's spans are space-joined and whitespace-collapsed; labels that end up empty are
 * skipped. `unit` and `level` values are gathered in iteration order and merged into one `unit`
 * component. Every other label is translated through `CLASSIFICATION_TO_TAG`, and labels with no
 * entry there are ignored. The resulting dict is handed to `formatAddress`.
 *
 * @param map - Iterable of `[label, spans]` pairs.
 * @param country - Country code, passed through to `formatAddress`.
 * @param opts - Formatting options, passed through to `formatAddress`.
 * @returns Whatever `formatAddress` returns for the translated components.
 */
export function formatFromClassificationMap(map, country, opts = {}) {
    const dict = {};
    const unitLike = [];
    for (const [label, spans] of map) {
        const text = spans.filter(Boolean).join(" ").replace(/\s+/g, " ").trim();
        if (text === "") {
            continue;
        }
        switch (label) {
            case "unit":
            case "level":
                // merged into a single `unit` component after the loop
                unitLike.push(text);
                break;
            default: {
                const tag = CLASSIFICATION_TO_TAG[label];
                if (tag) {
                    dict[tag] = text;
                }
            }
        }
    }
    if (unitLike.length > 0) {
        dict.unit = unitLike.join(" ");
    }
    return formatAddress(dict, country, opts);
}
|
|
84
|
+
/**
 * Drop any component whose value isn't actually present in the formatted `raw`.
 *
 * Both `raw` and each value are lowercased, have whitespace runs collapsed to a single space, and
 * are trimmed before a substring test, so the comparison is case- and whitespace-insensitive —
 * including leading/trailing whitespace on a value. Empty and whitespace-only values are always
 * dropped. A retained entry keeps its original, un-normalized value.
 *
 * @param components - Component dict to filter; it is not mutated.
 * @param raw - The formatted address string to search in.
 * @returns A new dict holding only the components found in `raw`.
 */
export function reconcileComponents(components, raw) {
    const squash = (text) => text.toLowerCase().replace(/\s+/g, " ").trim();
    const haystack = squash(raw);
    const kept = {};
    for (const [tag, value] of Object.entries(components)) {
        if (!value) {
            continue;
        }
        const needle = squash(value);
        // An empty needle (whitespace-only value) would trivially match every haystack.
        if (needle && haystack.includes(needle)) {
            kept[tag] = value;
        }
    }
    return kept;
}
|
|
102
|
+
/**
 * Translate a `ComponentTag` dict to the OpenCage vocabulary `@fragaria/address-formatter` expects.
 * Exported for testing and for callers that pre-build the dict for batch formatting.
 *
 * Only non-empty values are copied. The renames are: composed road line → `road`, `venue` →
 * `house`, `locality` → `city`, `dependent_locality` → `suburb`, `subregion` → `county`,
 * `region` → `state`, composed postcode → `postcode`. `house_number`, `po_box`, `attention` and
 * `country` keep their names.
 *
 * @param components - `ComponentTag`-keyed values.
 * @param country - Country code; trimmed and lowercased into `country_code`.
 * @returns A fresh OpenCage-keyed record (possibly empty).
 */
export function toOpenCageComponents(components, country) {
    const oc = {};
    // copy a value under an OpenCage key, skipping empty / missing values
    const put = (key, value) => {
        if (value) {
            oc[key] = value;
        }
    };
    put("road", composeRoad(components));
    put("house_number", components.house_number);
    put("house", components.venue);
    put("city", components.locality);
    put("suburb", components.dependent_locality);
    put("county", components.subregion);
    put("state", components.region);
    put("postcode", composePostcode(components));
    put("po_box", components.po_box);
    put("attention", components.attention);
    put("country", components.country);
    // country_code drives template selection, not output. It is only emitted next to at least one
    // other component — on its own the template would render the bare code ("US") as a line.
    const code = country.trim().toLowerCase();
    if (code !== "" && Object.keys(oc).length > 0) {
        oc.country_code = code;
    }
    return oc;
}
|
|
139
|
+
/**
 * Assemble the `road` line. When both intersection components are present they win outright;
 * otherwise the street parts are joined in a fixed order:
 *
 * ```
 * [intersection_a & intersection_b]
 *   OR
 * [street_prefix] [street_prefix_particle] [street] [street_suffix] [unit]
 * ```
 *
 * Missing or empty parts are skipped, and the joined street line is whitespace-collapsed and
 * trimmed. The intersection form is returned as-is.
 *
 * @param components - `ComponentTag`-keyed values.
 * @returns The road line, or `""` when no relevant component is set.
 */
function composeRoad(components) {
    const { intersection_a: first, intersection_b: second } = components;
    if (first && second) {
        return `${first} & ${second}`;
    }
    const ordered = [
        components.street_prefix,
        components.street_prefix_particle,
        components.street,
        components.street_suffix,
        components.unit,
    ];
    return ordered.filter(Boolean).join(" ").replace(/\s+/g, " ").trim();
}
|
|
165
|
+
/**
 * Fold CEDEX into the postcode for FR-style output: `"75008"` + `"CEDEX 08"` → `"75008 CEDEX 08"`.
 *
 * Both parts are trimmed first. With only one part present that part is returned (trimmed, inner
 * whitespace untouched); with both, they are space-joined and whitespace runs are collapsed.
 *
 * @param components - `ComponentTag`-keyed values; reads `postcode` and `cedex`.
 * @returns The combined postcode, or `""` when neither part is set.
 */
function composePostcode(components) {
    const tidy = (part) => part?.trim() ?? "";
    const base = tidy(components.postcode);
    const cedex = tidy(components.cedex);
    if (!base) {
        return cedex;
    }
    if (!cedex) {
        return base;
    }
    return `${base} ${cedex}`.replace(/\s+/g, " ");
}
|
|
176
|
+
//# sourceMappingURL=format.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"format.js","sourceRoot":"","sources":["../format.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,gBAAgB,MAAM,6BAA6B,CAAA;AA4B1D;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAAC,UAAyB,EAAE,OAAe,EAAE,OAA6B,EAAE;IACxG,MAAM,YAAY,GAAG,oBAAoB,CAAC,UAAU,EAAE,OAAO,CAAC,CAAA;IAC9D,IAAI,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAA;IAErD,MAAM,GAAG,GAAG,gBAAgB,CAAC,MAAM,CAAC,YAAY,EAAE;QACjD,UAAU,EAAE,IAAI,CAAC,UAAU,IAAI,KAAK;QACpC,aAAa,EAAE,IAAI,CAAC,aAAa,IAAI,KAAK;KAC1C,CAAC,CAAA;IAEF,MAAM,OAAO,GAAG,GAAG,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAA;IACxC,OAAO,IAAI,CAAC,SAAS,KAAK,SAAS,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,OAAO,CAAA;AACxF,CAAC;AAED;;;;;;GAMG;AACH,MAAM,qBAAqB,GAAyD;IACnF,OAAO,EAAE,SAAS;IAClB,MAAM,EAAE,QAAQ;IAChB,QAAQ,EAAE,UAAU;IACpB,UAAU,EAAE,oBAAoB;IAChC,QAAQ,EAAE,UAAU;IACpB,YAAY,EAAE,cAAc;IAC5B,MAAM,EAAE,QAAQ;IAChB,KAAK,EAAE,OAAO;CACd,CAAA;AAED;;;;GAIG;AACH,MAAM,UAAU,2BAA2B,CAC1C,GAAsB,EACtB,OAAe,EACf,OAA6B,EAAE;IAE/B,MAAM,UAAU,GAAkB,EAAE,CAAA;IACpC,MAAM,SAAS,GAAa,EAAE,CAAA;IAE9B,KAAK,MAAM,CAAC,cAAc,EAAE,MAAM,CAAC,IAAI,GAAG,EAAE,CAAC;QAC5C,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;QAC1E,IAAI,CAAC,KAAK;YAAE,SAAQ;QAEpB,IAAI,cAAc,KAAK,MAAM,IAAI,cAAc,KAAK,OAAO,EAAE,CAAC;YAC7D,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;YACrB,SAAQ;QACT,CAAC;QAED,MAAM,GAAG,GAAG,qBAAqB,CAAC,cAAc,CAAC,CAAA;QACjD,IAAI,GAAG;YAAE,UAAU,CAAC,GAAG,CAAC,GAAG,KAAK,CAAA;IACjC,CAAC;IAED,IAAI,SAAS,CAAC,MAAM;QAAE,UAAU,CAAC,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IAE3D,OAAO,aAAa,CAAC,UAAU,EAAE,OAAO,EAAE,IAAI,CAAC,CAAA;AAChD,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,mBAAmB,CAAC,UAAyB,EAAE,GAAW;IACzE,MAAM,QAAQ,GAAG,GAAG,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;IACvD,MAAM,GAAG,GAAkB,EAAE,CAAA;IAE7B,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,CAAC;QACjD,IAAI,CAAC,CAAC;YAAE,SAAQ;QAChB,MAAM,MAAM,GAAG,
CAAC,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;QACnD,IAAI,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAE,GAAG,CAAC,CAAiB,CAAC,GAAG,CAAC,CAAA;IAC1D,CAAC;IAED,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,oBAAoB,CAAC,UAAyB,EAAE,OAAe;IAC9E,MAAM,GAAG,GAA2B,EAAE,CAAA;IAEtC,MAAM,IAAI,GAAG,WAAW,CAAC,UAAU,CAAC,CAAA;IACpC,IAAI,IAAI;QAAE,GAAG,CAAC,IAAI,GAAG,IAAI,CAAA;IAEzB,IAAI,UAAU,CAAC,YAAY;QAAE,GAAG,CAAC,YAAY,GAAG,UAAU,CAAC,YAAY,CAAA;IACvE,IAAI,UAAU,CAAC,KAAK;QAAE,GAAG,CAAC,KAAK,GAAG,UAAU,CAAC,KAAK,CAAA;IAElD,IAAI,UAAU,CAAC,QAAQ;QAAE,GAAG,CAAC,IAAI,GAAG,UAAU,CAAC,QAAQ,CAAA;IACvD,IAAI,UAAU,CAAC,kBAAkB;QAAE,GAAG,CAAC,MAAM,GAAG,UAAU,CAAC,kBAAkB,CAAA;IAC7E,IAAI,UAAU,CAAC,SAAS;QAAE,GAAG,CAAC,MAAM,GAAG,UAAU,CAAC,SAAS,CAAA;IAC3D,IAAI,UAAU,CAAC,MAAM;QAAE,GAAG,CAAC,KAAK,GAAG,UAAU,CAAC,MAAM,CAAA;IAEpD,MAAM,QAAQ,GAAG,eAAe,CAAC,UAAU,CAAC,CAAA;IAC5C,IAAI,QAAQ;QAAE,GAAG,CAAC,QAAQ,GAAG,QAAQ,CAAA;IAErC,IAAI,UAAU,CAAC,MAAM;QAAE,GAAG,CAAC,MAAM,GAAG,UAAU,CAAC,MAAM,CAAA;IACrD,IAAI,UAAU,CAAC,SAAS;QAAE,GAAG,CAAC,SAAS,GAAG,UAAU,CAAC,SAAS,CAAA;IAE9D,IAAI,UAAU,CAAC,OAAO;QAAE,GAAG,CAAC,OAAO,GAAG,UAAU,CAAC,OAAO,CAAA;IAExD,iGAAiG;IACjG,iGAAiG;IACjG,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAA;IACvC,IAAI,EAAE,IAAI,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,MAAM,GAAG,CAAC;QAAE,GAAG,CAAC,YAAY,GAAG,EAAE,CAAA;IAE5D,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;;;;;GAQG;AACH,SAAS,WAAW,CAAC,UAAyB;IAC7C,IAAI,UAAU,CAAC,cAAc,IAAI,UAAU,CAAC,cAAc,EAAE,CAAC;QAC5D,OAAO,GAAG,UAAU,CAAC,cAAc,MAAM,UAAU,CAAC,cAAc,EAAE,CAAA;IACrE,CAAC;IAED,MAAM,KAAK,GAAa,EAAE,CAAA;IAC1B,IAAI,UAAU,CAAC,aAAa;QAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC,CAAA;IAClE,IAAI,UAAU,CAAC,sBAAsB;QAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,sBAAsB,CAAC,CAAA;IACpF,IAAI,UAAU,CAAC,MAAM;QAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAA;IACpD,IAAI,UAAU,CAAC,aAAa;QAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC,CAAA;IAClE,IAAI,UAAU,CAAC,IAAI;QAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAA;IAEhD,OAAO,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAA
I,EAAE,CAAA;AACnD,CAAC;AAED;;;GAGG;AACH,SAAS,eAAe,CAAC,UAAyB;IACjD,MAAM,IAAI,GAAG,UAAU,CAAC,QAAQ,EAAE,IAAI,EAAE,IAAI,EAAE,CAAA;IAC9C,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,CAAA;IAC5C,IAAI,IAAI,IAAI,KAAK;QAAE,OAAO,GAAG,IAAI,IAAI,KAAK,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;IACjE,OAAO,IAAI,IAAI,KAAK,CAAA;AACrB,CAAC"}
|
package/out/index.d.ts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `@mailwoman/formatter` — the inverse of the parser.
|
|
7
|
+
*
|
|
8
|
+
* - {@linkcode formatAddress} / {@linkcode formatFromClassificationMap}: components → idiomatic,
|
|
9
|
+
* locale-aware address string (for display and corpus synthesis).
|
|
10
|
+
* - {@linkcode canonicalKey}: components → a normalized, deterministic match key (for the matcher's
|
|
11
|
+
* blocking stage).
|
|
12
|
+
*/
|
|
13
|
+
export * from "./format.js";
|
|
14
|
+
export * from "./key.js";
|
|
15
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,aAAa,CAAA;AAC3B,cAAc,UAAU,CAAA"}
|
package/out/index.js
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `@mailwoman/formatter` — the inverse of the parser.
|
|
7
|
+
*
|
|
8
|
+
* - {@linkcode formatAddress} / {@linkcode formatFromClassificationMap}: components → idiomatic,
|
|
9
|
+
* locale-aware address string (for display and corpus synthesis).
|
|
10
|
+
* - {@linkcode canonicalKey}: components → a normalized, deterministic match key (for the matcher's
|
|
11
|
+
* blocking stage).
|
|
12
|
+
*/
|
|
13
|
+
export * from "./format.js";
|
|
14
|
+
export * from "./key.js";
|
|
15
|
+
//# sourceMappingURL=index.js.map
|
package/out/index.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,aAAa,CAAA;AAC3B,cAAc,UAAU,CAAA"}
|
package/out/key.d.ts
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* The canonical match key — a normalized, deterministic string derived from address components,
|
|
7
|
+
* distinct from the human-readable formatted string.
|
|
8
|
+
*
|
|
9
|
+
* Where `format.ts` produces something for a person to read, this produces something for a
|
|
10
|
+
* _machine_ to collide on: lowercased, diacritic-stripped, punctuation-flattened, whitespace-
|
|
11
|
+
* collapsed, fields in a fixed canonical order. Two records for the same address that differ only
|
|
12
|
+
* in spelling, case, or punctuation produce the same key — which is exactly what the matcher's
|
|
13
|
+
* blocking stage wants as one cheap, high-precision candidate signal (alongside geographic
|
|
14
|
+
* proximity, which carries the real weight — see the geocode-first record-matching concept doc).
|
|
15
|
+
*
|
|
16
|
+
* Deliberately NOT done yet (follow-ups, all gated on `@mailwoman/codex`): expanding street
|
|
17
|
+
* suffixes (`Ave` → `avenue`) and directionals (`N` → `north`) to a canonical form, and
|
|
18
|
+
* USPS-style standardization. This first cut is pure normalization with no dictionary expansion,
|
|
19
|
+
* so the key is stable and explainable; expansion is an additive refinement, not a rewrite.
|
|
20
|
+
*/
|
|
21
|
+
import type { ComponentDict } from "./format.js";
|
|
22
|
+
/** Options accepted by {@linkcode canonicalKey}. */
|
|
23
|
+
export interface CanonicalKeyOptions {
|
|
24
|
+
/** Field separator in the emitted key. Default `"|"` — preserves field boundaries for blocking. */
|
|
25
|
+
separator?: string;
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Normalize a single token for matching: Unicode-decompose and strip combining marks (so `é` →
|
|
29
|
+
* `e`), lowercase, delete apostrophes, replace `&`/`+`/`/` and all other non-alphanumeric characters with spaces, and
|
|
30
|
+
* collapse whitespace. Deterministic but lossy (not reversible) — the same input always yields the same
|
|
31
|
+
* output.
|
|
32
|
+
*/
|
|
33
|
+
export declare function normalizeAddressToken(input: string): string;
|
|
34
|
+
/**
|
|
35
|
+
* Derive the canonical match key from an address component dict: each present, address-identifying
|
|
36
|
+
* field normalized via {@linkcode normalizeAddressToken}, in fixed order, joined by the separator.
|
|
37
|
+
* Empty / whitespace-only fields are skipped. Returns an empty string if nothing identifying
|
|
38
|
+
* remains.
|
|
39
|
+
*/
|
|
40
|
+
export declare function canonicalKey(components: ComponentDict, opts?: CanonicalKeyOptions): string;
|
|
41
|
+
//# sourceMappingURL=key.d.ts.map
|
package/out/key.d.ts.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"key.d.ts","sourceRoot":"","sources":["../key.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAGH,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAA;AAwBhD,oDAAoD;AACpD,MAAM,WAAW,mBAAmB;IACnC,mGAAmG;IACnG,SAAS,CAAC,EAAE,MAAM,CAAA;CAClB;AAED;;;;;GAKG;AACH,wBAAgB,qBAAqB,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CAgB3D;AAED;;;;;GAKG;AACH,wBAAgB,YAAY,CAAC,UAAU,EAAE,aAAa,EAAE,IAAI,GAAE,mBAAwB,GAAG,MAAM,CAY9F"}
|
package/out/key.js
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* The canonical match key — a normalized, deterministic string derived from address components,
|
|
7
|
+
* distinct from the human-readable formatted string.
|
|
8
|
+
*
|
|
9
|
+
* Where `format.ts` produces something for a person to read, this produces something for a
|
|
10
|
+
* _machine_ to collide on: lowercased, diacritic-stripped, punctuation-flattened, whitespace-
|
|
11
|
+
* collapsed, fields in a fixed canonical order. Two records for the same address that differ only
|
|
12
|
+
* in spelling, case, or punctuation produce the same key — which is exactly what the matcher's
|
|
13
|
+
* blocking stage wants as one cheap, high-precision candidate signal (alongside geographic
|
|
14
|
+
* proximity, which carries the real weight — see the geocode-first record-matching concept doc).
|
|
15
|
+
*
|
|
16
|
+
* Deliberately NOT done yet (follow-ups, all gated on `@mailwoman/codex`): expanding street
|
|
17
|
+
* suffixes (`Ave` → `avenue`) and directionals (`N` → `north`) to a canonical form, and
|
|
18
|
+
* USPS-style standardization. This first cut is pure normalization with no dictionary expansion,
|
|
19
|
+
* so the key is stable and explainable; expansion is an additive refinement, not a rewrite.
|
|
20
|
+
*/
|
|
21
|
+
/**
 * Component tags that contribute to the canonical match key, listed in the exact order they are
 * emitted. `venue` and `attention` are deliberately left out — those carry organization identity,
 * which the record layer keys separately.
 */
const KEY_FIELD_ORDER = [
    // delivery point
    "po_box",
    "house_number",
    // street line
    "street_prefix",
    "street_prefix_particle",
    "street",
    "street_suffix",
    // intersection pair
    "intersection_a",
    "intersection_b",
    // sub-premise
    "unit",
    // administrative hierarchy, narrowest to widest, with the postcode before the country
    "dependent_locality",
    "locality",
    "subregion",
    "region",
    "postcode",
    "country",
];
|
|
42
|
+
/**
 * Normalize a single token for matching. Steps, in order:
 *
 * 1. NFKD-decompose and strip the combining diacritical marks U+0300–U+036F (so `é` → `e`).
 * 2. Lowercase.
 * 3. Delete apostrophes (`'`, `’`, `` ` ``) so possessives and names like "O'Brien" stay whole.
 * 4. Replace `&`, `+`, `/` and every other character that is not a letter, mark, digit or
 *    whitespace with a space.
 * 5. Collapse whitespace runs and trim.
 *
 * Letters and digits are recognised by Unicode category rather than `a-z0-9`, so letters with no
 * decomposition (`ø`, `ß`) and non-Latin scripts (Cyrillic, CJK, …) survive instead of being
 * turned into spaces. Inputs that normalize to ASCII produce the same output as an ASCII-only
 * filter would. Deterministic and lossy: the same input always yields the same output.
 *
 * @param input - Raw component text.
 * @returns The normalized token, or `""` if nothing alphanumeric remains.
 */
export function normalizeAddressToken(input) {
    return (input
        .normalize("NFKD")
        // written with escapes: raw combining characters are invisible and easily mangled
        .replace(/[\u0300-\u036f]/g, "")
        .toLowerCase()
        // apostrophes are intra-word — delete rather than split the token
        .replace(/['\u2019`]/g, "")
        // connective punctuation becomes a space rather than vanishing (so "A&B" → "a b", not "ab")
        .replace(/[&+/]/g, " ")
        // everything that is not a Unicode letter, mark, number or whitespace is noise
        .replace(/[^\p{L}\p{M}\p{N}\s]/gu, " ")
        .replace(/\s+/g, " ")
        .trim());
}
|
|
63
|
+
/**
 * Derive the canonical match key from an address component dict.
 *
 * Walks `KEY_FIELD_ORDER`, takes each component that is set, normalizes it with
 * {@linkcode normalizeAddressToken}, discards anything that normalizes to an empty string, and
 * joins the survivors with the separator. Skipped fields leave no placeholder in the key.
 *
 * @param components - `ComponentTag`-keyed values.
 * @param opts - `separator` overrides the default `"|"` field separator.
 * @returns The joined key, or `""` if no identifying field remains.
 */
export function canonicalKey(components, opts = {}) {
    return KEY_FIELD_ORDER
        .map((tag) => components[tag])
        .filter(Boolean)
        .map((value) => normalizeAddressToken(value))
        .filter(Boolean)
        .join(opts.separator ?? "|");
}
|
|
82
|
+
//# sourceMappingURL=key.js.map
|
package/out/key.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"key.js","sourceRoot":"","sources":["../key.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAKH;;;GAGG;AACH,MAAM,eAAe,GAAG;IACvB,QAAQ;IACR,cAAc;IACd,eAAe;IACf,wBAAwB;IACxB,QAAQ;IACR,eAAe;IACf,gBAAgB;IAChB,gBAAgB;IAChB,MAAM;IACN,oBAAoB;IACpB,UAAU;IACV,WAAW;IACX,QAAQ;IACR,UAAU;IACV,SAAS;CACkC,CAAA;AAQ5C;;;;;GAKG;AACH,MAAM,UAAU,qBAAqB,CAAC,KAAa;IAClD,OAAO,CACN,KAAK;SACH,SAAS,CAAC,MAAM,CAAC;QAClB,iFAAiF;SAChF,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;SACrB,WAAW,EAAE;QACd,wFAAwF;SACvF,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;QACtB,4FAA4F;SAC3F,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;QACvB,0DAA0D;SACzD,OAAO,CAAC,cAAc,EAAE,GAAG,CAAC;SAC5B,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,IAAI,EAAE,CACR,CAAA;AACF,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,YAAY,CAAC,UAAyB,EAAE,OAA4B,EAAE;IACrF,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,GAAG,CAAA;IACvC,MAAM,KAAK,GAAa,EAAE,CAAA;IAE1B,KAAK,MAAM,GAAG,IAAI,eAAe,EAAE,CAAC;QACnC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,CAAC,CAAA;QAC7B,IAAI,CAAC,KAAK;YAAE,SAAQ;QACpB,MAAM,UAAU,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAA;QAC/C,IAAI,UAAU;YAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;IACvC,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;AAC7B,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@mailwoman/formatter",
|
|
3
|
+
"version": "4.8.1",
|
|
4
|
+
"description": "The inverse of the parser: render Mailwoman `ComponentTag` components into an idiomatic, locale-aware address string — plus a canonical, normalized match key for record linkage. Consolidates the former core/formatter stub and corpus synthesis formatter behind one API.",
|
|
5
|
+
"license": "AGPL-3.0-only",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "https://github.com/sister-software/mailwoman.git",
|
|
9
|
+
"directory": "formatter"
|
|
10
|
+
},
|
|
11
|
+
"type": "module",
|
|
12
|
+
"exports": {
|
|
13
|
+
"./package.json": "./package.json",
|
|
14
|
+
".": "./out/index.js",
|
|
15
|
+
"./format": "./out/format.js",
|
|
16
|
+
"./key": "./out/key.js"
|
|
17
|
+
},
|
|
18
|
+
"dependencies": {
|
|
19
|
+
"@fragaria/address-formatter": "^6.7.1",
|
|
20
|
+
"@mailwoman/core": "4.8.0"
|
|
21
|
+
},
|
|
22
|
+
"devDependencies": {
|
|
23
|
+
"@types/node": "^25.9.2"
|
|
24
|
+
},
|
|
25
|
+
"files": [
|
|
26
|
+
"out/**/*.js",
|
|
27
|
+
"out/**/*.js.map",
|
|
28
|
+
"out/**/*.d.ts",
|
|
29
|
+
"out/**/*.d.ts.map"
|
|
30
|
+
],
|
|
31
|
+
"publishConfig": {
|
|
32
|
+
"access": "public"
|
|
33
|
+
}
|
|
34
|
+
}
|