@mailwoman/formatter 4.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,72 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Render a `ComponentTag`-keyed dict into a country-localized string — the inverse of the parser.
7
+ *
8
+ * This is the canonical home for Mailwoman's address formatting, consolidated from two earlier
9
+ * half-implementations: the `core/formatter` stub (which wrapped OpenCage but hardcoded `US`) and
10
+ * the corpus synthesis formatter (`corpus/src/format.ts`, the fuller one this is ported from).
11
+ *
12
+ * It bridges Mailwoman's `ComponentTag` schema to OpenCage's `address-formatting` templates
13
+ * (vendored via `@fragaria/address-formatter`, MIT) so callers get idiomatic per-country output
14
+ * without reinventing template logic. Owning our own templates — so we can express the slots
15
+ * OpenCage can't (`unit`, `intersection`, `cedex`, the JP tags) — is a deliberate follow-up; this
16
+ * first cut keeps Fragaria as the engine and concentrates the mapping in one place.
17
+ *
18
+ * Known limitations inherited from the OpenCage vocabulary (documented, not blockers):
19
+ *
20
+ * - `unit`: no slot, so units ride the road line (`"Pennsylvania Ave NW Apt 4B"`).
21
+ * - `intersection_a` / `intersection_b`: joined as `"<a> & <b>"` into the road field.
22
+ * - `cedex` (FR): folded into `postcode` (`"75008 CEDEX 08"`) so the FR template slots it right.
23
+ * - JP-specific tags (`prefecture`, `municipality`, …): no mapping yet.
24
+ */
25
+ import type { ClassificationMap } from "@mailwoman/core/classification";
26
+ import type { ComponentTag } from "@mailwoman/core/types";
27
+ /** The canonical formatter input: any subset of `ComponentTag` keys, each mapped to a string value. */
28
+ export type ComponentDict = Partial<Record<ComponentTag, string>>;
29
+ /** Options accepted by `formatAddress`. */
30
+ export interface FormatAddressOptions {
31
+ /**
32
+ * Append the country name as a final line (`"USA"`, `"France"`). Default `false`: most rows are
33
+ * intra-country and the country line is redundant noise.
34
+ */
35
+ appendCountry?: boolean;
36
+ /**
37
+ * Apply OpenCage's per-country abbreviation rules (`"Avenue"` → `"Ave"`). Default `false` —
38
+ * callers that want abbreviation usually run it as their own augmentation pass.
39
+ */
40
+ abbreviate?: boolean;
41
+ /**
42
+ * Replace each run of newlines in the rendered template with this separator. Default `undefined`
43
+ * (keep newlines). Use `", "` for comma-separated single-line output, or `" "` for space-joined lines.
44
+ */
45
+ separator?: string;
46
+ }
47
+ /**
48
+ * Render a component dict into an idiomatic per-country address string.
49
+ *
50
+ * Returns an empty string if `components` is empty after translation; otherwise the rendered text
51
+ * with trailing whitespace removed. Intended never to throw: bad inputs degrade to the longest meaningful prefix.
52
+ */
53
+ export declare function formatAddress(components: ComponentDict, country: string, opts?: FormatAddressOptions): string;
54
+ /**
55
+ * Format a legacy {@linkcode ClassificationMap} (`Map<VisibleClassification, string[]>`, as emitted
56
+ * by the rule-based pipeline) into an idiomatic address string. Subsumes the former
57
+ * `core/formatter` stub. Multi-span values are space-joined; `unit` and `level` are merged into `unit`.
58
+ */
59
+ export declare function formatFromClassificationMap(map: ClassificationMap, country: string, opts?: FormatAddressOptions): string;
60
+ /**
61
+ * Drop any component whose value isn't actually present in the formatted `raw`. OpenCage's
62
+ * per-country templates legitimately omit some inputs (FR regions absorbed by the postcode; US
63
+ * state names abbreviated), and downstream alignment requires `components[tag]` to occur in `raw`.
64
+ * Matching is a case-insensitive substring test with whitespace runs collapsed; the retained value is the original input.
65
+ */
66
+ export declare function reconcileComponents(components: ComponentDict, raw: string): ComponentDict;
67
+ /**
68
+ * Translate a `ComponentTag` dict to the OpenCage vocabulary `@fragaria/address-formatter` expects;
69
+ * `country` is added as a lowercased `country_code` only when another component is present. Exported for testing and for callers that pre-build the dict for batch formatting.
70
+ */
71
+ export declare function toOpenCageComponents(components: ComponentDict, country: string): Record<string, string>;
72
+ //# sourceMappingURL=format.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"format.d.ts","sourceRoot":"","sources":["../format.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAGH,OAAO,KAAK,EAAE,iBAAiB,EAAyB,MAAM,gCAAgC,CAAA;AAC9F,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAA;AAEzD,sFAAsF;AACtF,MAAM,MAAM,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAA;AAEjE,2CAA2C;AAC3C,MAAM,WAAW,oBAAoB;IACpC;;;OAGG;IACH,aAAa,CAAC,EAAE,OAAO,CAAA;IAEvB;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,CAAA;IAEpB;;;OAGG;IACH,SAAS,CAAC,EAAE,MAAM,CAAA;CAClB;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,UAAU,EAAE,aAAa,EAAE,OAAO,EAAE,MAAM,EAAE,IAAI,GAAE,oBAAyB,GAAG,MAAM,CAWjH;AAoBD;;;;GAIG;AACH,wBAAgB,2BAA2B,CAC1C,GAAG,EAAE,iBAAiB,EACtB,OAAO,EAAE,MAAM,EACf,IAAI,GAAE,oBAAyB,GAC7B,MAAM,CAoBR;AAED;;;;;GAKG;AACH,wBAAgB,mBAAmB,CAAC,UAAU,EAAE,aAAa,EAAE,GAAG,EAAE,MAAM,GAAG,aAAa,CAWzF;AAED;;;GAGG;AACH,wBAAgB,oBAAoB,CAAC,UAAU,EAAE,aAAa,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CA4BvG"}
package/out/format.js ADDED
@@ -0,0 +1,176 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Render a `ComponentTag`-keyed dict into a country-localized string — the inverse of the parser.
7
+ *
8
+ * This is the canonical home for Mailwoman's address formatting, consolidated from two earlier
9
+ * half-implementations: the `core/formatter` stub (which wrapped OpenCage but hardcoded `US`) and
10
+ * the corpus synthesis formatter (`corpus/src/format.ts`, the fuller one this is ported from).
11
+ *
12
+ * It bridges Mailwoman's `ComponentTag` schema to OpenCage's `address-formatting` templates
13
+ * (vendored via `@fragaria/address-formatter`, MIT) so callers get idiomatic per-country output
14
+ * without reinventing template logic. Owning our own templates — so we can express the slots
15
+ * OpenCage can't (`unit`, `intersection`, `cedex`, the JP tags) — is a deliberate follow-up; this
16
+ * first cut keeps Fragaria as the engine and concentrates the mapping in one place.
17
+ *
18
+ * Known limitations inherited from the OpenCage vocabulary (documented, not blockers):
19
+ *
20
+ * - `unit`: no slot, so units ride the road line (`"Pennsylvania Ave NW Apt 4B"`).
21
+ * - `intersection_a` / `intersection_b`: joined as `"<a> & <b>"` into the road field.
22
+ * - `cedex` (FR): folded into `postcode` (`"75008 CEDEX 08"`) so the FR template slots it right.
23
+ * - JP-specific tags (`prefecture`, `municipality`, …): no mapping yet.
24
+ */
25
+ import addressFormatter from "@fragaria/address-formatter";
/**
 * Render a component dict into an idiomatic per-country address string.
 *
 * The components are first translated to the OpenCage vocabulary by `toOpenCageComponents`. When
 * that translation yields no keys the result is `""` and the template engine is never invoked.
 * Otherwise the rendered text has its trailing whitespace removed and, when `opts.separator` is
 * defined, every run of newlines is replaced by that separator.
 *
 * @param components `ComponentTag`-keyed values to render.
 * @param country Country code forwarded to `toOpenCageComponents` for template selection.
 * @param opts Optional `abbreviate` / `appendCountry` flags (each defaults to `false`) and `separator`.
 * @returns The formatted address, or `""` when nothing was translatable.
 */
export function formatAddress(components, country, opts = {}) {
    const ocComponents = toOpenCageComponents(components, country);
    if (Object.keys(ocComponents).length === 0)
        return "";
    const rendered = addressFormatter.format(ocComponents, {
        abbreviate: opts.abbreviate ?? false,
        appendCountry: opts.appendCountry ?? false,
    });
    const trimmed = rendered.replace(/\s+$/g, "");
    const { separator } = opts;
    if (separator === undefined)
        return trimmed;
    // A function replacer inserts the separator verbatim. A plain string replacement would expand
    // `$&`, `$1`, `$$`, … if the caller's separator happened to contain them.
    return trimmed.replace(/\n+/g, () => separator);
}
43
+ /**
44
+ * Map of legacy rule-classifier {@linkcode VisibleClassification} labels to the canonical
45
+ * `ComponentTag` schema. The two vocabularies are kept independent on purpose (rule classifiers
46
+ * emit one, the neural classifier the other); this adapter is the bridge so a `ClassificationMap`
47
+ * can use the same formatter. `unit` / `level` have no entry: `formatFromClassificationMap` merges
48
+ * them into `unit` itself. NOTE(review): any other unlisted label (e.g. a designator label) is skipped — confirm that is intended.
49
+ */
50
+ const CLASSIFICATION_TO_TAG = {
51
+ country: "country",
52
+ region: "region",
53
+ locality: "locality",
54
+ dependency: "dependent_locality",
55
+ postcode: "postcode",
56
+ house_number: "house_number",
57
+ street: "street",
58
+ venue: "venue",
59
+ };
/**
 * Render a legacy {@linkcode ClassificationMap} (label → list of spans, as produced by the
 * rule-based pipeline) through the same formatter used for `ComponentTag` dicts.
 *
 * For every entry the truthy spans are space-joined, whitespace runs are collapsed and the ends
 * trimmed; entries that end up empty are ignored. `unit` and `level` texts are collected in
 * iteration order and space-joined into the `unit` component. Every other label is looked up in
 * `CLASSIFICATION_TO_TAG` and skipped when it has no mapping.
 */
export function formatFromClassificationMap(map, country, opts = {}) {
    const unitLikeTexts = [];
    const dict = {};
    for (const [label, spans] of map) {
        const text = spans.filter(Boolean).join(" ").replace(/\s+/g, " ").trim();
        if (!text)
            continue;
        const isUnitLike = label === "unit" || label === "level";
        if (isUnitLike) {
            unitLikeTexts.push(text);
        }
        else {
            const mappedTag = CLASSIFICATION_TO_TAG[label];
            if (mappedTag)
                dict[mappedTag] = text;
        }
    }
    if (unitLikeTexts.length)
        dict.unit = unitLikeTexts.join(" ");
    return formatAddress(dict, country, opts);
}
/**
 * Drop any component whose value isn't actually present in the formatted `raw`. OpenCage's
 * per-country templates legitimately omit some inputs, and downstream alignment requires
 * `components[tag]` to occur in `raw`.
 *
 * Matching is a case-insensitive substring test after collapsing whitespace runs to one space and
 * trimming the candidate value. Values that are not strings, or that are empty after trimming,
 * are dropped. A retained entry keeps the caller's original, untrimmed value.
 *
 * @param components Candidate components keyed by tag.
 * @param raw The formatted address text to search.
 * @returns A new object holding only the components found in `raw`; the input is not mutated.
 */
export function reconcileComponents(components, raw) {
    const haystack = raw.toLowerCase().replace(/\s+/g, " ");
    const out = {};
    for (const [k, v] of Object.entries(components)) {
        if (typeof v !== "string")
            continue;
        // Trim so padded values can match at the edges of `raw`, and so a whitespace-only value
        // is not "found" merely because the haystack contains a space.
        const needle = v.toLowerCase().replace(/\s+/g, " ").trim();
        if (!needle)
            continue;
        if (haystack.includes(needle))
            out[k] = v;
    }
    return out;
}
/**
 * Translate a `ComponentTag` dict to the OpenCage vocabulary `@fragaria/address-formatter` expects.
 * Exported for testing and for callers that pre-build the dict for batch formatting.
 *
 * Only truthy values are copied. Keys are written in this order: `road` (from `composeRoad`),
 * `house_number`, `house` (from `venue`), `city` (from `locality`), `suburb` (from
 * `dependent_locality`), `county` (from `subregion`), `state` (from `region`), `postcode` (from
 * `composePostcode`), `po_box`, `attention`, `country`, and finally `country_code`.
 *
 * @param components `ComponentTag`-keyed values.
 * @param country Country code; trimmed and lowercased into `country_code`. A missing or
 *   non-string value is treated as empty instead of throwing.
 * @returns A fresh OpenCage-keyed object; empty when no component was usable.
 */
export function toOpenCageComponents(components, country) {
    const out = {};
    const put = (ocKey, value) => {
        if (value)
            out[ocKey] = value;
    };
    put("road", composeRoad(components));
    put("house_number", components.house_number);
    put("house", components.venue);
    put("city", components.locality);
    put("suburb", components.dependent_locality);
    put("county", components.subregion);
    put("state", components.region);
    put("postcode", composePostcode(components));
    put("po_box", components.po_box);
    put("attention", components.attention);
    put("country", components.country);
    // country_code drives template selection, not output. Only emit it alongside another component —
    // otherwise the template renders the bare code ("US") as a fallback line, which no caller wants.
    // Guard the type so an untyped JS caller that omits `country` gets no code rather than a TypeError.
    const cc = typeof country === "string" ? country.trim().toLowerCase() : "";
    if (cc && Object.keys(out).length > 0)
        out.country_code = cc;
    return out;
}
/**
 * Assemble the single `road` value handed to the OpenCage templates.
 *
 * When both `intersection_a` and `intersection_b` are set the result is `"<a> & <b>"` and the
 * street-level fields (including `unit`) are ignored. Otherwise the truthy values of
 *
 * ```
 * street_prefix, street_prefix_particle, street, street_suffix, unit
 * ```
 *
 * are space-joined in that order, whitespace runs are collapsed and the ends trimmed. Returns `""`
 * when none of them is set.
 */
function composeRoad(components) {
    const { intersection_a: crossA, intersection_b: crossB } = components;
    if (crossA && crossB)
        return `${crossA} & ${crossB}`;
    return [
        components.street_prefix,
        components.street_prefix_particle,
        components.street,
        components.street_suffix,
        components.unit,
    ]
        .filter(Boolean)
        .join(" ")
        .replace(/\s+/g, " ")
        .trim();
}
/**
 * Combine `postcode` and `cedex` into the one postcode value the FR template slots correctly:
 * `"75008"` + `"CEDEX 08"` → `"75008 CEDEX 08"`. Both parts are trimmed first. When both are
 * present they are space-joined and whitespace runs are collapsed; when only one is present it is
 * returned trimmed but otherwise untouched; when neither is, the result is `""`.
 */
function composePostcode(components) {
    const [postcode, cedex] = [components.postcode, components.cedex].map((part) => part?.trim() ?? "");
    if (!postcode)
        return cedex;
    if (!cedex)
        return postcode;
    return `${postcode} ${cedex}`.replace(/\s+/g, " ");
}
176
+ //# sourceMappingURL=format.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"format.js","sourceRoot":"","sources":["../format.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,OAAO,gBAAgB,MAAM,6BAA6B,CAAA;AA4B1D;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAAC,UAAyB,EAAE,OAAe,EAAE,OAA6B,EAAE;IACxG,MAAM,YAAY,GAAG,oBAAoB,CAAC,UAAU,EAAE,OAAO,CAAC,CAAA;IAC9D,IAAI,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAA;IAErD,MAAM,GAAG,GAAG,gBAAgB,CAAC,MAAM,CAAC,YAAY,EAAE;QACjD,UAAU,EAAE,IAAI,CAAC,UAAU,IAAI,KAAK;QACpC,aAAa,EAAE,IAAI,CAAC,aAAa,IAAI,KAAK;KAC1C,CAAC,CAAA;IAEF,MAAM,OAAO,GAAG,GAAG,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAA;IACxC,OAAO,IAAI,CAAC,SAAS,KAAK,SAAS,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,OAAO,CAAA;AACxF,CAAC;AAED;;;;;;GAMG;AACH,MAAM,qBAAqB,GAAyD;IACnF,OAAO,EAAE,SAAS;IAClB,MAAM,EAAE,QAAQ;IAChB,QAAQ,EAAE,UAAU;IACpB,UAAU,EAAE,oBAAoB;IAChC,QAAQ,EAAE,UAAU;IACpB,YAAY,EAAE,cAAc;IAC5B,MAAM,EAAE,QAAQ;IAChB,KAAK,EAAE,OAAO;CACd,CAAA;AAED;;;;GAIG;AACH,MAAM,UAAU,2BAA2B,CAC1C,GAAsB,EACtB,OAAe,EACf,OAA6B,EAAE;IAE/B,MAAM,UAAU,GAAkB,EAAE,CAAA;IACpC,MAAM,SAAS,GAAa,EAAE,CAAA;IAE9B,KAAK,MAAM,CAAC,cAAc,EAAE,MAAM,CAAC,IAAI,GAAG,EAAE,CAAC;QAC5C,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;QAC1E,IAAI,CAAC,KAAK;YAAE,SAAQ;QAEpB,IAAI,cAAc,KAAK,MAAM,IAAI,cAAc,KAAK,OAAO,EAAE,CAAC;YAC7D,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;YACrB,SAAQ;QACT,CAAC;QAED,MAAM,GAAG,GAAG,qBAAqB,CAAC,cAAc,CAAC,CAAA;QACjD,IAAI,GAAG;YAAE,UAAU,CAAC,GAAG,CAAC,GAAG,KAAK,CAAA;IACjC,CAAC;IAED,IAAI,SAAS,CAAC,MAAM;QAAE,UAAU,CAAC,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IAE3D,OAAO,aAAa,CAAC,UAAU,EAAE,OAAO,EAAE,IAAI,CAAC,CAAA;AAChD,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,mBAAmB,CAAC,UAAyB,EAAE,GAAW;IACzE,MAAM,QAAQ,GAAG,GAAG,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;IACvD,MAAM,GAAG,GAAkB,EAAE,CAAA;IAE7B,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,CAAC;QACjD,IAAI,CAAC,CAAC;YAAE,SAAQ;QAChB,MAAM,MAAM,GAA
G,CAAC,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;QACnD,IAAI,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC;YAAE,GAAG,CAAC,CAAiB,CAAC,GAAG,CAAC,CAAA;IAC1D,CAAC;IAED,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,oBAAoB,CAAC,UAAyB,EAAE,OAAe;IAC9E,MAAM,GAAG,GAA2B,EAAE,CAAA;IAEtC,MAAM,IAAI,GAAG,WAAW,CAAC,UAAU,CAAC,CAAA;IACpC,IAAI,IAAI;QAAE,GAAG,CAAC,IAAI,GAAG,IAAI,CAAA;IAEzB,IAAI,UAAU,CAAC,YAAY;QAAE,GAAG,CAAC,YAAY,GAAG,UAAU,CAAC,YAAY,CAAA;IACvE,IAAI,UAAU,CAAC,KAAK;QAAE,GAAG,CAAC,KAAK,GAAG,UAAU,CAAC,KAAK,CAAA;IAElD,IAAI,UAAU,CAAC,QAAQ;QAAE,GAAG,CAAC,IAAI,GAAG,UAAU,CAAC,QAAQ,CAAA;IACvD,IAAI,UAAU,CAAC,kBAAkB;QAAE,GAAG,CAAC,MAAM,GAAG,UAAU,CAAC,kBAAkB,CAAA;IAC7E,IAAI,UAAU,CAAC,SAAS;QAAE,GAAG,CAAC,MAAM,GAAG,UAAU,CAAC,SAAS,CAAA;IAC3D,IAAI,UAAU,CAAC,MAAM;QAAE,GAAG,CAAC,KAAK,GAAG,UAAU,CAAC,MAAM,CAAA;IAEpD,MAAM,QAAQ,GAAG,eAAe,CAAC,UAAU,CAAC,CAAA;IAC5C,IAAI,QAAQ;QAAE,GAAG,CAAC,QAAQ,GAAG,QAAQ,CAAA;IAErC,IAAI,UAAU,CAAC,MAAM;QAAE,GAAG,CAAC,MAAM,GAAG,UAAU,CAAC,MAAM,CAAA;IACrD,IAAI,UAAU,CAAC,SAAS;QAAE,GAAG,CAAC,SAAS,GAAG,UAAU,CAAC,SAAS,CAAA;IAE9D,IAAI,UAAU,CAAC,OAAO;QAAE,GAAG,CAAC,OAAO,GAAG,UAAU,CAAC,OAAO,CAAA;IAExD,iGAAiG;IACjG,iGAAiG;IACjG,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAA;IACvC,IAAI,EAAE,IAAI,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,MAAM,GAAG,CAAC;QAAE,GAAG,CAAC,YAAY,GAAG,EAAE,CAAA;IAE5D,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;;;;;GAQG;AACH,SAAS,WAAW,CAAC,UAAyB;IAC7C,IAAI,UAAU,CAAC,cAAc,IAAI,UAAU,CAAC,cAAc,EAAE,CAAC;QAC5D,OAAO,GAAG,UAAU,CAAC,cAAc,MAAM,UAAU,CAAC,cAAc,EAAE,CAAA;IACrE,CAAC;IAED,MAAM,KAAK,GAAa,EAAE,CAAA;IAC1B,IAAI,UAAU,CAAC,aAAa;QAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC,CAAA;IAClE,IAAI,UAAU,CAAC,sBAAsB;QAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,sBAAsB,CAAC,CAAA;IACpF,IAAI,UAAU,CAAC,MAAM;QAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAA;IACpD,IAAI,UAAU,CAAC,aAAa;QAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC,CAAA;IAClE,IAAI,UAAU,CAAC,IAAI;QAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAA;IAEhD,OAAO,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,I
AAI,EAAE,CAAA;AACnD,CAAC;AAED;;;GAGG;AACH,SAAS,eAAe,CAAC,UAAyB;IACjD,MAAM,IAAI,GAAG,UAAU,CAAC,QAAQ,EAAE,IAAI,EAAE,IAAI,EAAE,CAAA;IAC9C,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,CAAA;IAC5C,IAAI,IAAI,IAAI,KAAK;QAAE,OAAO,GAAG,IAAI,IAAI,KAAK,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;IACjE,OAAO,IAAI,IAAI,KAAK,CAAA;AACrB,CAAC"}
package/out/index.d.ts ADDED
@@ -0,0 +1,15 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `@mailwoman/formatter` — the inverse of the parser.
7
+ *
8
+ * - {@linkcode formatAddress} / {@linkcode formatFromClassificationMap}: components → idiomatic,
9
+ * locale-aware address string (for display and corpus synthesis).
10
+ * - {@linkcode canonicalKey}: components → a normalized, deterministic match key (for the matcher's
11
+ * blocking stage).
12
+ */
13
+ export * from "./format.js";
14
+ export * from "./key.js";
15
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,aAAa,CAAA;AAC3B,cAAc,UAAU,CAAA"}
package/out/index.js ADDED
@@ -0,0 +1,15 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `@mailwoman/formatter` — the inverse of the parser.
7
+ *
8
+ * - {@linkcode formatAddress} / {@linkcode formatFromClassificationMap}: components → idiomatic,
9
+ * locale-aware address string (for display and corpus synthesis).
10
+ * - {@linkcode canonicalKey}: components → a normalized, deterministic match key (for the matcher's
11
+ * blocking stage).
12
+ */
13
+ export * from "./format.js";
14
+ export * from "./key.js";
15
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,aAAa,CAAA;AAC3B,cAAc,UAAU,CAAA"}
package/out/key.d.ts ADDED
@@ -0,0 +1,41 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * The canonical match key — a normalized, deterministic string derived from address components,
7
+ * distinct from the human-readable formatted string.
8
+ *
9
+ * Where `format.ts` produces something for a person to read, this produces something for a
10
+ * _machine_ to collide on: lowercased, diacritic-stripped, punctuation-flattened, whitespace-
11
+ * collapsed, fields in a fixed canonical order. Two records for the same address that differ only
12
+ * in diacritics, case, or punctuation produce the same key — which is exactly what the matcher's
13
+ * blocking stage wants as one cheap, high-precision candidate signal (alongside geographic
14
+ * proximity, which carries the real weight — see the geocode-first record-matching concept doc).
15
+ *
16
+ * Deliberately NOT done yet (follow-ups, all gated on `@mailwoman/codex`): expanding street
17
+ * suffixes (`Ave` → `avenue`) and directionals (`N` → `north`) to a canonical form, and
18
+ * USPS-style standardization. This first cut is pure normalization with no dictionary expansion,
19
+ * so the key is stable and explainable; expansion is an additive refinement, not a rewrite.
20
+ */
21
+ import type { ComponentDict } from "./format.js";
22
+ /** Options accepted by {@linkcode canonicalKey}. */
23
+ export interface CanonicalKeyOptions {
24
+ /** String placed between normalized fields in the emitted key. Default `"|"` — preserves field boundaries for blocking. */
25
+ separator?: string;
26
+ }
27
+ /**
28
+ * Normalize a single token for matching: NFKD-decompose and strip combining marks (so `é` →
29
+ * `e`), lowercase, delete apostrophes, turn every other character outside `a-z` / `0-9` into a space,
30
+ * then collapse whitespace and trim. Deterministic but lossy — the same input always yields the same
31
+ * output, and the original cannot be recovered from it.
32
+ */
33
+ export declare function normalizeAddressToken(input: string): string;
34
+ /**
35
+ * Derive the canonical match key from an address component dict: each present, address-identifying
36
+ * field normalized via {@linkcode normalizeAddressToken}, in fixed order, joined by the separator.
37
+ * Fields that are empty, or that normalize to nothing, are skipped. Returns an empty string if nothing identifying
38
+ * remains.
39
+ */
40
+ export declare function canonicalKey(components: ComponentDict, opts?: CanonicalKeyOptions): string;
41
+ //# sourceMappingURL=key.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"key.d.ts","sourceRoot":"","sources":["../key.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAGH,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAA;AAwBhD,oDAAoD;AACpD,MAAM,WAAW,mBAAmB;IACnC,mGAAmG;IACnG,SAAS,CAAC,EAAE,MAAM,CAAA;CAClB;AAED;;;;;GAKG;AACH,wBAAgB,qBAAqB,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CAgB3D;AAED;;;;;GAKG;AACH,wBAAgB,YAAY,CAAC,UAAU,EAAE,aAAa,EAAE,IAAI,GAAE,mBAAwB,GAAG,MAAM,CAY9F"}
package/out/key.js ADDED
@@ -0,0 +1,82 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * The canonical match key — a normalized, deterministic string derived from address components,
7
+ * distinct from the human-readable formatted string.
8
+ *
9
+ * Where `format.ts` produces something for a person to read, this produces something for a
10
+ * _machine_ to collide on: lowercased, diacritic-stripped, punctuation-flattened, whitespace-
11
+ * collapsed, fields in a fixed canonical order. Two records for the same address that differ only
12
+ * in diacritics, case, or punctuation produce the same key — which is exactly what the matcher's
13
+ * blocking stage wants as one cheap, high-precision candidate signal (alongside geographic
14
+ * proximity, which carries the real weight — see the geocode-first record-matching concept doc).
15
+ *
16
+ * Deliberately NOT done yet (follow-ups, all gated on `@mailwoman/codex`): expanding street
17
+ * suffixes (`Ave` → `avenue`) and directionals (`N` → `north`) to a canonical form, and
18
+ * USPS-style standardization. This first cut is pure normalization with no dictionary expansion,
19
+ * so the key is stable and explainable; expansion is an additive refinement, not a rewrite.
20
+ */
21
+ /**
22
+ * The address-identifying components, in the order `canonicalKey` emits them. Venue / attention are intentionally
23
+ * excluded — those carry organization identity, which the record layer keys separately. NOTE(review): `cedex` is also absent — confirm that is intended.
24
+ */
25
+ const KEY_FIELD_ORDER = [
26
+ "po_box",
27
+ "house_number",
28
+ "street_prefix",
29
+ "street_prefix_particle",
30
+ "street",
31
+ "street_suffix",
32
+ "intersection_a",
33
+ "intersection_b",
34
+ "unit",
35
+ "dependent_locality",
36
+ "locality",
37
+ "subregion",
38
+ "region",
39
+ "postcode",
40
+ "country",
41
+ ];
/**
 * Normalize a single token for matching. Steps, in order:
 *
 * 1. NFKD-decompose and delete combining marks in U+0300–U+036F, so `é` → `e`.
 * 2. Lowercase.
 * 3. Delete apostrophes (`'`, U+2019, and the backtick) so possessives and names like "O'Brien"
 *    stay one word.
 * 4. Turn `&`, `+`, `/` and every other character outside `a-z`, `0-9` and whitespace into a space.
 * 5. Collapse whitespace runs to one space and trim.
 *
 * Deterministic but lossy: the same input always yields the same output, and the original cannot
 * be recovered from it. Letters with no NFKD decomposition (and all non-Latin scripts) fall under
 * step 4 and become spaces.
 *
 * @param input Raw field text.
 * @returns The normalized token; `""` when nothing alphanumeric remains.
 */
export function normalizeAddressToken(input) {
    // The ranges are written as escapes rather than raw characters: raw combining marks are
    // invisible in most editors and easy to corrupt when the file is re-encoded.
    const withoutMarks = input.normalize("NFKD").replace(/[\u0300-\u036f]/g, "");
    const wholeWords = withoutMarks.toLowerCase().replace(/['\u2019`]/g, "");
    return (wholeWords
        // connective punctuation becomes a space rather than vanishing (so "A&B" → "a b", not "ab")
        .replace(/[&+/]/g, " ")
        // everything else non-alphanumeric (keep whitespace) is noise, also replaced by a space
        .replace(/[^a-z0-9\s]/g, " ")
        .replace(/\s+/g, " ")
        .trim());
}
/**
 * Build the canonical match key for a component dict.
 *
 * Walks `KEY_FIELD_ORDER`, takes each field that has a truthy value, normalizes it with
 * {@linkcode normalizeAddressToken}, discards results that normalize to the empty string, and joins
 * what is left with `opts.separator` (default `"|"`). Returns `""` when no field survives.
 */
export function canonicalKey(components, opts = {}) {
    const fieldSeparator = opts.separator ?? "|";
    return KEY_FIELD_ORDER
        .map((tag) => components[tag])
        .filter(Boolean)
        .map((rawValue) => normalizeAddressToken(rawValue))
        .filter(Boolean)
        .join(fieldSeparator);
}
82
+ //# sourceMappingURL=key.js.map
package/out/key.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"key.js","sourceRoot":"","sources":["../key.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAKH;;;GAGG;AACH,MAAM,eAAe,GAAG;IACvB,QAAQ;IACR,cAAc;IACd,eAAe;IACf,wBAAwB;IACxB,QAAQ;IACR,eAAe;IACf,gBAAgB;IAChB,gBAAgB;IAChB,MAAM;IACN,oBAAoB;IACpB,UAAU;IACV,WAAW;IACX,QAAQ;IACR,UAAU;IACV,SAAS;CACkC,CAAA;AAQ5C;;;;;GAKG;AACH,MAAM,UAAU,qBAAqB,CAAC,KAAa;IAClD,OAAO,CACN,KAAK;SACH,SAAS,CAAC,MAAM,CAAC;QAClB,iFAAiF;SAChF,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;SACrB,WAAW,EAAE;QACd,wFAAwF;SACvF,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;QACtB,4FAA4F;SAC3F,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;QACvB,0DAA0D;SACzD,OAAO,CAAC,cAAc,EAAE,GAAG,CAAC;SAC5B,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,IAAI,EAAE,CACR,CAAA;AACF,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,YAAY,CAAC,UAAyB,EAAE,OAA4B,EAAE;IACrF,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,GAAG,CAAA;IACvC,MAAM,KAAK,GAAa,EAAE,CAAA;IAE1B,KAAK,MAAM,GAAG,IAAI,eAAe,EAAE,CAAC;QACnC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,CAAC,CAAA;QAC7B,IAAI,CAAC,KAAK;YAAE,SAAQ;QACpB,MAAM,UAAU,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAA;QAC/C,IAAI,UAAU;YAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;IACvC,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAA;AAC7B,CAAC"}
package/package.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "name": "@mailwoman/formatter",
3
+ "version": "4.8.1",
4
+ "description": "The inverse of the parser: render Mailwoman `ComponentTag` components into an idiomatic, locale-aware address string — plus a canonical, normalized match key for record linkage. Consolidates the former core/formatter stub and corpus synthesis formatter behind one API.",
5
+ "license": "AGPL-3.0-only",
6
+ "repository": {
7
+ "type": "git",
8
+ "url": "https://github.com/sister-software/mailwoman.git",
9
+ "directory": "formatter"
10
+ },
11
+ "type": "module",
12
+ "exports": {
13
+ "./package.json": "./package.json",
14
+ ".": "./out/index.js",
15
+ "./format": "./out/format.js",
16
+ "./key": "./out/key.js"
17
+ },
18
+ "dependencies": {
19
+ "@fragaria/address-formatter": "^6.7.1",
20
+ "@mailwoman/core": "4.8.0"
21
+ },
22
+ "devDependencies": {
23
+ "@types/node": "^25.9.2"
24
+ },
25
+ "files": [
26
+ "out/**/*.js",
27
+ "out/**/*.js.map",
28
+ "out/**/*.d.ts",
29
+ "out/**/*.d.ts.map"
30
+ ],
31
+ "publishConfig": {
32
+ "access": "public"
33
+ }
34
+ }