@mailwoman/record 4.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/address.d.ts +85 -0
- package/out/address.d.ts.map +1 -0
- package/out/address.js +44 -0
- package/out/address.js.map +1 -0
- package/out/index.d.ts +16 -0
- package/out/index.d.ts.map +1 -0
- package/out/index.js +16 -0
- package/out/index.js.map +1 -0
- package/out/name.d.ts +49 -0
- package/out/name.d.ts.map +1 -0
- package/out/name.js +253 -0
- package/out/name.js.map +1 -0
- package/out/organization.d.ts +37 -0
- package/out/organization.d.ts.map +1 -0
- package/out/organization.js +129 -0
- package/out/organization.js.map +1 -0
- package/package.json +34 -0
package/out/address.d.ts
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* The canonical address record — the matcher's unit of address identity, and the spine the
|
|
7
|
+
* organization and contact records build on.
|
|
8
|
+
*
|
|
9
|
+
* It is plain data: parser components + the formatter's match key + an optional resolved geocode,
|
|
10
|
+
* composed into one object. No ORM, no decorators, no schema-generation machinery — if we need a
|
|
11
|
+
* database we reach for Kysely at the call site, not a model layer here.
|
|
12
|
+
*
|
|
13
|
+
* The geocode fields mirror mailwoman's `GeocodeResult` (tier + calibrated uncertainty + hierarchy)
|
|
14
|
+
* on purpose: that is the location signal the Fellegi-Sunter scorer weights its distance evidence
|
|
15
|
+
* by — two records sharing an `address_point` coordinate is strong agreement; sharing an
|
|
16
|
+
* `interpolated` centroid is weak; a PO-box / multi-unit coordinate is barely location agreement
|
|
17
|
+
* at all (the NAACCR precedent, see the geocode-first record-matching concept doc).
|
|
18
|
+
*/
|
|
19
|
+
import { type ComponentDict, type FormatAddressOptions } from "@mailwoman/formatter";
|
|
20
|
+
/**
 * A point on the globe, expressed in WGS84 decimal degrees.
 */
export interface GeoCoordinate {
	/** Latitude, in decimal degrees. */
	latitude: number;
	/** Longitude, in decimal degrees. */
	longitude: number;
}
|
|
25
|
+
/**
 * Which resolution tier produced a coordinate. The tiers mirror mailwoman's geocoder and rank
 * `address_point` > `interpolated` > `admin`.
 *
 * Declared here as a plain local union so this package stays decoupled from the heavy geocoder
 * runtime; a `GeocodeResult.resolution_tier` maps in directly.
 */
export type ResolutionTier =
	| "address_point"
	| "interpolated"
	| "admin";
|
|
31
|
+
/**
 * A single resolved ancestor in the admin hierarchy, used for spelling-invariant blocking.
 * Ancestors are listed most specific first.
 */
export interface HierarchyNode {
	/** Tag naming the kind of ancestor this node is. */
	tag: string;
	/** The resolved value for that tag. */
	value: string;
	/** Identifier of the resolved place, when one is available. */
	placeId?: string;
}
|
|
37
|
+
/**
 * The resolved geocode carried by an address record. This is the location signal the matcher
 * scores on.
 */
export interface AddressGeocode {
	/** The resolved point. */
	coordinate: GeoCoordinate;
	/** The resolution tier that produced {@linkcode AddressGeocode.coordinate}. */
	tier: ResolutionTier;
	/** Calibrated uncertainty radius, in meters. `null` for the admin tier (no sub-locality estimate). */
	uncertaintyMeters: number | null;
	/** Resolved admin hierarchy, ordered most specific first (locality → country). */
	hierarchy?: Array<HierarchyNode>;
	/** Marks a delivery point rather than a building; weakens location agreement even at a precise coordinate. */
	poBox?: boolean;
	/** Marks a multi-unit building where many records share one coordinate; weakens unit-level agreement. */
	multiUnit?: boolean;
}
|
|
50
|
+
/**
 * The canonical address record: the parser's components, the formatter's match key, an optional
 * display string, and an optional resolved geocode, composed into one object. It is plain data
 * and carries no behavior.
 */
export interface PostalAddress {
	/** The parsed address components, keyed by `ComponentTag`. */
	components: ComponentDict;
	/** Normalized, deterministic match key used for blocking (produced by `@mailwoman/formatter`). */
	canonicalKey: string;
	/** Human-readable single-line form for display, when one was computed. */
	formatted?: string;
	/** The resolved location, present once the address has been geocoded. */
	geocode?: AddressGeocode;
	/** The original free-text input, kept as provenance when known. */
	raw?: string;
}
|
|
66
|
+
/**
 * Optional settings accepted by {@linkcode toPostalAddress}.
 */
export interface ToPostalAddressOptions {
	/** Country (ISO-2 or name) used for formatting. Falls back to the `country` component, else unset. */
	country?: string;
	/** Original free-text input to keep on the record as provenance. */
	raw?: string;
	/** Whether to also compute the human-readable `formatted` string. Defaults to `true`. */
	format?: boolean;
	/** Options forwarded to the formatter. Defaults to single-line output (`", "`). */
	formatOptions?: FormatAddressOptions;
}
|
|
77
|
+
/**
 * Build a canonical {@linkcode PostalAddress} from parsed components.
 *
 * The match key is always filled in; the human-readable form is filled in unless disabled through
 * the options. A geocode is not attached here: add it with {@linkcode withGeocode} once the
 * address has been resolved.
 *
 * @param components - The parsed address components to wrap.
 * @param opts - Optional country, provenance, and formatting settings.
 * @returns The newly built address record.
 */
export declare function toPostalAddress(
	components: ComponentDict,
	opts?: ToPostalAddressOptions,
): PostalAddress;
|
|
83
|
+
/**
 * Attach a resolved geocode to an address record, replacing any geocode already present.
 *
 * @param record - The address record to extend.
 * @param geocode - The resolved geocode to attach.
 * @returns A new record; the one passed in is not modified.
 */
export declare function withGeocode(
	record: PostalAddress,
	geocode: AddressGeocode,
): PostalAddress;
|
|
85
|
+
//# sourceMappingURL=address.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"address.d.ts","sourceRoot":"","sources":["../address.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;GAiBG;AAEH,OAAO,EAAE,KAAK,aAAa,EAAE,KAAK,oBAAoB,EAA+B,MAAM,sBAAsB,CAAA;AAEjH,uDAAuD;AACvD,MAAM,WAAW,aAAa;IAC7B,QAAQ,EAAE,MAAM,CAAA;IAChB,SAAS,EAAE,MAAM,CAAA;CACjB;AAED;;;;GAIG;AACH,MAAM,MAAM,cAAc,GAAG,eAAe,GAAG,cAAc,GAAG,OAAO,CAAA;AAEvE,oGAAoG;AACpG,MAAM,WAAW,aAAa;IAC7B,GAAG,EAAE,MAAM,CAAA;IACX,KAAK,EAAE,MAAM,CAAA;IACb,OAAO,CAAC,EAAE,MAAM,CAAA;CAChB;AAED,oGAAoG;AACpG,MAAM,WAAW,cAAc;IAC9B,UAAU,EAAE,aAAa,CAAA;IACzB,IAAI,EAAE,cAAc,CAAA;IACpB,qGAAqG;IACrG,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAA;IAChC,0EAA0E;IAC1E,SAAS,CAAC,EAAE,aAAa,EAAE,CAAA;IAC3B,kGAAkG;IAClG,KAAK,CAAC,EAAE,OAAO,CAAA;IACf,oGAAoG;IACpG,SAAS,CAAC,EAAE,OAAO,CAAA;CACnB;AAED;;;GAGG;AACH,MAAM,WAAW,aAAa;IAC7B,wDAAwD;IACxD,UAAU,EAAE,aAAa,CAAA;IACzB,sFAAsF;IACtF,YAAY,EAAE,MAAM,CAAA;IACpB,6DAA6D;IAC7D,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,wCAAwC;IACxC,OAAO,CAAC,EAAE,cAAc,CAAA;IACxB,6DAA6D;IAC7D,GAAG,CAAC,EAAE,MAAM,CAAA;CACZ;AAED,+CAA+C;AAC/C,MAAM,WAAW,sBAAsB;IACtC,+FAA+F;IAC/F,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,4DAA4D;IAC5D,GAAG,CAAC,EAAE,MAAM,CAAA;IACZ,wEAAwE;IACxE,MAAM,CAAC,EAAE,OAAO,CAAA;IAChB,uFAAuF;IACvF,aAAa,CAAC,EAAE,oBAAoB,CAAA;CACpC;AAED;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,UAAU,EAAE,aAAa,EAAE,IAAI,GAAE,sBAA2B,GAAG,aAAa,CAgB3G;AAED,2FAA2F;AAC3F,wBAAgB,WAAW,CAAC,MAAM,EAAE,aAAa,EAAE,OAAO,EAAE,cAAc,GAAG,aAAa,CAEzF"}
|
package/out/address.js
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* The canonical address record — the matcher's unit of address identity, and the spine the
|
|
7
|
+
* organization and contact records build on.
|
|
8
|
+
*
|
|
9
|
+
* It is plain data: parser components + the formatter's match key + an optional resolved geocode,
|
|
10
|
+
* composed into one object. No ORM, no decorators, no schema-generation machinery — if we need a
|
|
11
|
+
* database we reach for Kysely at the call site, not a model layer here.
|
|
12
|
+
*
|
|
13
|
+
* The geocode fields mirror mailwoman's `GeocodeResult` (tier + calibrated uncertainty + hierarchy)
|
|
14
|
+
* on purpose: that is the location signal the Fellegi-Sunter scorer weights its distance evidence
|
|
15
|
+
* by — two records sharing an `address_point` coordinate is strong agreement; sharing an
|
|
16
|
+
* `interpolated` centroid is weak; a PO-box / multi-unit coordinate is barely location agreement
|
|
17
|
+
* at all (the NAACCR precedent, see the geocode-first record-matching concept doc).
|
|
18
|
+
*/
|
|
19
|
+
import { canonicalKey, formatAddress } from "@mailwoman/formatter";
|
|
20
|
+
/**
 * Build a canonical {@linkcode PostalAddress} from parsed components.
 *
 * The match key is always computed with `canonicalKey`. Unless `opts.format` is `false`, the
 * components are also passed to `formatAddress`, and its result is stored as `formatted` when it
 * is truthy. `opts.raw` is copied onto the record whenever it is not `undefined`. Attach a geocode
 * separately with {@linkcode withGeocode} once the address is resolved.
 *
 * @param components - Parsed address components.
 * @param opts - Optional `country`, `raw`, `format`, and `formatOptions` settings.
 * @returns A new record holding the components, the match key, and any optional fields.
 */
export function toPostalAddress(components, opts = {}) {
	// Explicit option wins, then the parsed `country` component, then the empty string.
	const country = opts.country ?? components.country ?? "";
	const matchKey = canonicalKey(components);

	const provenance = opts.raw === undefined ? {} : { raw: opts.raw };

	// Formatting is opt-out: only an explicit `format: false` skips the formatter call.
	const display =
		opts.format === false
			? ""
			: formatAddress(components, country, opts.formatOptions ?? { separator: ", " });

	return {
		components,
		canonicalKey: matchKey,
		...provenance,
		...(display ? { formatted: display } : {}),
	};
}
|
|
40
|
+
/**
 * Attach a resolved geocode to an address record, replacing any geocode it already carries.
 *
 * @param record - The address record to copy.
 * @param geocode - The geocode to set on the copy.
 * @returns A shallow copy of `record` with `geocode` set; `record` itself is left untouched.
 */
export function withGeocode(record, geocode) {
	const next = { ...record };
	next.geocode = geocode;
	return next;
}
|
|
44
|
+
//# sourceMappingURL=address.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"address.js","sourceRoot":"","sources":["../address.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;GAiBG;AAEH,OAAO,EAAiD,YAAY,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAA;AAiEjH;;;;GAIG;AACH,MAAM,UAAU,eAAe,CAAC,UAAyB,EAAE,OAA+B,EAAE;IAC3F,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,IAAI,UAAU,CAAC,OAAO,IAAI,EAAE,CAAA;IAExD,MAAM,MAAM,GAAkB;QAC7B,UAAU;QACV,YAAY,EAAE,YAAY,CAAC,UAAU,CAAC;KACtC,CAAA;IAED,IAAI,IAAI,CAAC,GAAG,KAAK,SAAS;QAAE,MAAM,CAAC,GAAG,GAAG,IAAI,CAAC,GAAG,CAAA;IAEjD,IAAI,IAAI,CAAC,MAAM,KAAK,KAAK,EAAE,CAAC;QAC3B,MAAM,SAAS,GAAG,aAAa,CAAC,UAAU,EAAE,OAAO,EAAE,IAAI,CAAC,aAAa,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;QAC/F,IAAI,SAAS;YAAE,MAAM,CAAC,SAAS,GAAG,SAAS,CAAA;IAC5C,CAAC;IAED,OAAO,MAAM,CAAA;AACd,CAAC;AAED,2FAA2F;AAC3F,MAAM,UAAU,WAAW,CAAC,MAAqB,EAAE,OAAuB;IACzE,OAAO,EAAE,GAAG,MAAM,EAAE,OAAO,EAAE,CAAA;AAC9B,CAAC"}
|
package/out/index.d.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `@mailwoman/record` — the canonicalize layer for the geocode-first matcher.
|
|
7
|
+
*
|
|
8
|
+
* Address-first: {@linkcode PostalAddress} is the spine. The per-field normalizers
|
|
9
|
+
* ({@linkcode parsePersonName}, {@linkcode canonicalizeOrganizationName}) build on the same
|
|
10
|
+
* plain-data pattern. Contact records and the comparator/Fellegi-Sunter layer land in the
|
|
11
|
+
* matcher.
|
|
12
|
+
*/
|
|
13
|
+
export * from "./address.js";
|
|
14
|
+
export * from "./name.js";
|
|
15
|
+
export * from "./organization.js";
|
|
16
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,cAAc,CAAA;AAC5B,cAAc,WAAW,CAAA;AACzB,cAAc,mBAAmB,CAAA"}
|
package/out/index.js
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `@mailwoman/record` — the canonicalize layer for the geocode-first matcher.
|
|
7
|
+
*
|
|
8
|
+
* Address-first: {@linkcode PostalAddress} is the spine. The per-field normalizers
|
|
9
|
+
* ({@linkcode parsePersonName}, {@linkcode canonicalizeOrganizationName}) build on the same
|
|
10
|
+
* plain-data pattern. Contact records and the comparator/Fellegi-Sunter layer land in the
|
|
11
|
+
* matcher.
|
|
12
|
+
*/
|
|
13
|
+
export * from "./address.js";
|
|
14
|
+
export * from "./name.js";
|
|
15
|
+
export * from "./organization.js";
|
|
16
|
+
//# sourceMappingURL=index.js.map
|
package/out/index.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,cAAc,cAAc,CAAA;AAC5B,cAAc,WAAW,CAAA;AACzB,cAAc,mBAAmB,CAAA"}
|
package/out/name.d.ts
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Person-name parsing — split a full name into components for the matcher to canonicalize and
|
|
7
|
+
* compare.
|
|
8
|
+
*
|
|
9
|
+
* A rule-based positional parser, the portable recipe from `python-nameparser`: split on a comma to
|
|
10
|
+
* detect `Last, First` inversion, then classify tokens by position against configurable title /
|
|
11
|
+
* suffix / particle lists. We deliberately store the surname **particle** (`van`, `de la`, `von`)
|
|
12
|
+
* separately from the bare surname (the `theiconic/name-parser` pattern) so the matcher can
|
|
13
|
+
* compare `Vega` independent of `de la` — sources that drop or vary the particle still match.
|
|
14
|
+
*
|
|
15
|
+
* Scope + honesty (per the name-canonicalization research pass):
|
|
16
|
+
*
|
|
17
|
+
* - Western / romanized names only. Cultural given-family ORDER variation (East-Asian family-first)
|
|
18
|
+
* and transliteration are not handled here — a documented follow-up.
|
|
19
|
+
* - Nickname → canonical-root mapping is intentionally NOT done at parse time: it is lossy and
|
|
20
|
+
* gendered (Bobbie → Robert _or_ Roberta), so equivalence belongs in the matcher as a fuzzy
|
|
21
|
+
* agreement level, not a destructive rewrite. We only _extract_ a parenthetical/quoted
|
|
22
|
+
* nickname.
|
|
23
|
+
* - A CRF parser (probablepeople) is the gold-standard reference but too heavy to port; this
|
|
24
|
+
* positional parser covers the documented hard cases (inversion, particles, generational +
|
|
25
|
+
* professional suffixes) without a model.
|
|
26
|
+
*/
|
|
27
|
+
/**
 * The components of a parsed person name. Every field is optional: the parser fills in only the
 * parts it can identify.
 */
export interface PersonName {
	/** Title or salutation that came before the name (`Dr`, `Mr`, `Capt`). */
	prefix?: string;
	/** Given (first) name. */
	given?: string;
	/** Middle name(s) or initial. */
	middle?: string;
	/** Surname with any particle removed (`Vega`, not `de la Vega`). */
	family?: string;
	/** The surname particle, kept apart from the surname (`de la`, `van der`, `von`). */
	familyParticle?: string;
	/** Generational or professional suffix (`Jr`, `III`, `PhD`, `MD`). */
	suffix?: string;
	/** A nickname given in parentheses or quotes (`"Gob"` in `George "Gob" Bluth`). */
	nickname?: string;
}
|
|
44
|
+
/**
 * Split a full name into its components.
 *
 * Parsing is best-effort and does not throw: ambiguous input degrades gracefully instead of
 * raising an error.
 *
 * @param input - The full name to parse.
 * @returns The parsed components, or `null` when the input is empty.
 */
export declare function parsePersonName(
	input: string | null | undefined,
): PersonName | null;
|
|
49
|
+
//# sourceMappingURL=name.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"name.d.ts","sourceRoot":"","sources":["../name.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAEH,yFAAyF;AACzF,MAAM,WAAW,UAAU;IAC1B,sEAAsE;IACtE,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,0BAA0B;IAC1B,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,iCAAiC;IACjC,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,kEAAkE;IAClE,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,uEAAuE;IACvE,cAAc,CAAC,EAAE,MAAM,CAAA;IACvB,sEAAsE;IACtE,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,4EAA4E;IAC5E,QAAQ,CAAC,EAAE,MAAM,CAAA;CACjB;AA8ID;;;GAGG;AACH,wBAAgB,eAAe,CAAC,KAAK,EAAE,MAAM,GAAG,IAAI,GAAG,SAAS,GAAG,UAAU,GAAG,IAAI,CAsFnF"}
|
package/out/name.js
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Person-name parsing — split a full name into components for the matcher to canonicalize and
|
|
7
|
+
* compare.
|
|
8
|
+
*
|
|
9
|
+
* A rule-based positional parser, the portable recipe from `python-nameparser`: split on a comma to
|
|
10
|
+
* detect `Last, First` inversion, then classify tokens by position against configurable title /
|
|
11
|
+
* suffix / particle lists. We deliberately store the surname **particle** (`van`, `de la`, `von`)
|
|
12
|
+
* separately from the bare surname (the `theiconic/name-parser` pattern) so the matcher can
|
|
13
|
+
* compare `Vega` independent of `de la` — sources that drop or vary the particle still match.
|
|
14
|
+
*
|
|
15
|
+
* Scope + honesty (per the name-canonicalization research pass):
|
|
16
|
+
*
|
|
17
|
+
* - Western / romanized names only. Cultural given-family ORDER variation (East-Asian family-first)
|
|
18
|
+
* and transliteration are not handled here — a documented follow-up.
|
|
19
|
+
* - Nickname → canonical-root mapping is intentionally NOT done at parse time: it is lossy and
|
|
20
|
+
* gendered (Bobbie → Robert _or_ Roberta), so equivalence belongs in the matcher as a fuzzy
|
|
21
|
+
* agreement level, not a destructive rewrite. We only _extract_ a parenthetical/quoted
|
|
22
|
+
* nickname.
|
|
23
|
+
* - A CRF parser (probablepeople) is the gold-standard reference but too heavy to port; this
|
|
24
|
+
* positional parser covers the documented hard cases (inversion, particles, generational +
|
|
25
|
+
* professional suffixes) without a model.
|
|
26
|
+
*/
|
|
27
|
+
/** Titles / salutations that lead a name. Matched case-insensitively, trailing `.` ignored. */
const TITLES = new Set([
	"airman", "br", "brig", "brigadier", "capt", "captain", "cmdr", "col", "colonel", "commander",
	"commissioner", "cpl", "cpt", "dep", "deputy", "doctor", "dr", "father", "fr", "gen",
	"general", "hon", "honorable", "judge", "lt", "ltcol", "ltgen", "maj", "major", "master",
	"miss", "mr", "mrs", "ms", "mx", "pastor", "pfc", "pres", "president", "private",
	"prof", "professor", "pvt", "rabbi", "rep", "representative", "rev", "reverend", "sen", "senator",
	"sgt", "sir", "sister",
]);

/** Generational + professional suffixes that trail a name. */
const SUFFIXES = new Set([
	// generational
	"jr", "sr", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii",
	// professional / honorific
	"phd", "md", "do", "dds", "dmd", "dvm", "esq", "esquire", "jd", "llm", "cpa", "rn",
	"lpn", "pa", "pe", "od", "dc", "dpm", "psyd", "edd", "mba", "mfa", "msw", "pharmd",
]);

/**
 * Surname particles. Consecutive particles fold together (`de` + `la` → `de la`), and the next
 * non-particle token begins the bare surname.
 */
const PARTICLES = new Set([
	"al", "bin", "da", "das", "de", "del", "della", "den", "der", "di",
	"do", "dos", "du", "el", "ibn", "la", "le", "lo", "mac", "mc",
	"san", "santa", "st", "ter", "van", "vande", "vanden", "vander", "vere", "von",
	"zu", "zur",
]);

/**
 * Matches the normalized generational suffixes (`jr`, `sr`, roman numerals). These are excluded
 * from dotted-acronym matching so that initials such as `J.R.` are not mistaken for `Jr`.
 */
const GENERATIONAL_SUFFIX = /^(?:jr|sr|[ivx]+)$/;

/** True when `s` is a string containing at least one non-whitespace character. */
const isPresent = (s) => typeof s === "string" && s.trim().length > 0;

/** Lowercase a token and drop a single trailing period (`Jr.` → `jr`). */
const norm = (token) => token.replace(/\.$/, "").toLowerCase();

/** True when the token, as written, is a known suffix (`Jr.`, `III`, `PhD`). */
const isSuffix = (token) => SUFFIXES.has(norm(token));

/**
 * True when the token is a professional suffix written as a dotted acronym (`M.D.`, `Ph.D.`).
 * Only tokens with interior periods qualify, and generational suffixes never do.
 */
const isDottedSuffix = (token) => {
	const bare = norm(token);
	const collapsed = bare.replace(/\./g, "");
	return collapsed !== bare && SUFFIXES.has(collapsed) && !GENERATIONAL_SUFFIX.test(collapsed);
};

/**
 * True when a token trailing the given-name side of `Last, First …` can be lifted out as a suffix.
 * Single letters are left alone (likely a middle initial), as are tokens that double as particles.
 */
const isDetachableSuffix = (token) => {
	const key = norm(token);
	return isDottedSuffix(token) || (key.length > 1 && SUFFIXES.has(key) && !PARTICLES.has(key));
};

/** Build a plain object from `fields`, keeping only the non-blank string values. */
const dropEmpty = (fields) =>
	Object.fromEntries(Object.entries(fields).filter(([, value]) => isPresent(value)));

/**
 * Parse a full name into components. Returns `null` for empty input. Best-effort and non-throwing —
 * ambiguous input degrades gracefully rather than erroring.
 *
 * Handling, in order:
 *
 * 1. A parenthetical or double-quoted (straight or curly) nickname is extracted and removed.
 * 2. Commas: trailing comma-separated segments made up only of suffixes become the suffix
 *    (`John Smith, Jr.`, `Smith, John, Jr.`); exactly two remaining segments are read as
 *    `Last, First`; stray commas are dropped. A dotted acronym (`M.D.`) only counts as a suffix
 *    when at least two name words precede it, so `Salinger, J.D.` stays an inversion.
 * 3. Leading titles become `prefix`; trailing suffixes become `suffix`. At least one name token
 *    is always kept.
 * 4. The first particle that is followed by another token starts the surname; the particle run is
 *    stored in `familyParticle` and the rest in `family`.
 * 5. Otherwise the first token is `given`, the last is `family`, and anything between is `middle`.
 *
 * @param input - The full name to parse; `null`, `undefined`, and blank strings yield `null`.
 * @returns An object holding only the identified fields, or `null` when nothing was identified.
 */
export function parsePersonName(input) {
	if (!isPresent(input)) return null;

	// 1. Extract a parenthetical "(Jim)" or quoted "Jim" nickname, then strip it out.
	//    The first non-empty nickname wins; later ones are removed but not recorded.
	let nickname;
	const takeNickname = (_match, inner) => {
		if (!nickname) nickname = inner.trim();
		return " ";
	};
	const stripped = input
		.replace(/\s*\(([^)]+)\)\s*/g, takeNickname)
		.replace(/\s*["\u201C]([^"\u201D]+)["\u201D]\s*/g, takeNickname)
		.replace(/\s+/g, " ")
		.trim();

	// 2. Resolve commas. Empty segments (leading, trailing, doubled commas) are discarded.
	const segments = stripped
		.split(",")
		.map((part) => part.trim())
		.filter(Boolean);

	// Suffixes found via commas, kept in reading order.
	const commaSuffixes = [];
	while (segments.length > 1) {
		const nameWordCount = segments.slice(0, -1).join(" ").split(/\s+/).length;
		const words = segments[segments.length - 1].split(/\s+/);
		const allSuffixes = words.every(
			(word) => isSuffix(word) || (nameWordCount > 1 && isDottedSuffix(word)),
		);
		if (!allSuffixes) break;
		commaSuffixes.unshift(segments.pop());
	}

	let working = segments.join(" ");
	if (segments.length === 2) {
		// "Last, First [Middle] [Suffix]": lift a trailing suffix off the given side before
		// reordering, so that "Smith, John Jr." does not end up with "Jr." as a middle name.
		const givenWords = segments[1].split(/\s+/);
		const liftedSuffixes = [];
		while (givenWords.length > 1 && isDetachableSuffix(givenWords[givenWords.length - 1])) {
			liftedSuffixes.unshift(givenWords.pop());
		}
		commaSuffixes.unshift(...liftedSuffixes);
		working = `${givenWords.join(" ")} ${segments[0]}`;
	}

	const tokens = working.split(/\s+/).filter(Boolean);
	if (tokens.length === 0) {
		const leftovers = dropEmpty({ suffix: commaSuffixes.join(" "), nickname });
		return Object.keys(leftovers).length ? leftovers : null;
	}

	// 3. Leading titles → prefix (a single name token must remain).
	const prefixParts = [];
	while (tokens.length > 1 && TITLES.has(norm(tokens[0]))) {
		prefixParts.push(tokens.shift());
	}
	const prefix = prefixParts.join(" ");

	// 4. Trailing suffixes → suffix (a single name token must remain; a dotted acronym
	//    additionally needs two name tokens left in front of it).
	const trailingSuffixes = [];
	while (tokens.length > 1) {
		const last = tokens[tokens.length - 1];
		if (!isSuffix(last) && !(tokens.length > 2 && isDottedSuffix(last))) break;
		trailingSuffixes.unshift(tokens.pop());
	}
	const suffix = [...trailingSuffixes, ...commaSuffixes].join(" ");

	// 5. Locate the surname particle run; everything from it onward is the (particled) surname.
	//    A particle only starts a surname if a bare-surname token follows it.
	const particleStart = tokens.findIndex(
		(token, index) => index < tokens.length - 1 && PARTICLES.has(norm(token)),
	);
	if (particleStart >= 0) {
		let surnameStart = particleStart;
		while (surnameStart < tokens.length - 1 && PARTICLES.has(norm(tokens[surnameStart]))) {
			surnameStart++;
		}
		const before = tokens.slice(0, particleStart);
		return dropEmpty({
			prefix,
			given: before[0],
			middle: before.slice(1).join(" "),
			family: tokens.slice(surnameStart).join(" "),
			familyParticle: tokens.slice(particleStart, surnameStart).join(" "),
			suffix,
			nickname,
		});
	}

	// 6. No particle: last token is the surname, first is given, the rest is middle.
	//    A lone token is treated as the given name.
	return dropEmpty({
		prefix,
		given: tokens[0],
		middle: tokens.slice(1, -1).join(" "),
		family: tokens.length > 1 ? tokens[tokens.length - 1] : undefined,
		suffix,
		nickname,
	});
}
|
|
253
|
+
//# sourceMappingURL=name.js.map
|
package/out/name.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"name.js","sourceRoot":"","sources":["../name.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAoBH,+FAA+F;AAC/F,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC;IACtB,QAAQ;IACR,IAAI;IACJ,MAAM;IACN,WAAW;IACX,MAAM;IACN,SAAS;IACT,MAAM;IACN,KAAK;IACL,SAAS;IACT,WAAW;IACX,cAAc;IACd,KAAK;IACL,KAAK;IACL,KAAK;IACL,QAAQ;IACR,QAAQ;IACR,IAAI;IACJ,QAAQ;IACR,IAAI;IACJ,KAAK;IACL,SAAS;IACT,KAAK;IACL,WAAW;IACX,OAAO;IACP,IAAI;IACJ,OAAO;IACP,OAAO;IACP,KAAK;IACL,OAAO;IACP,QAAQ;IACR,MAAM;IACN,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,QAAQ;IACR,KAAK;IACL,MAAM;IACN,WAAW;IACX,SAAS;IACT,MAAM;IACN,WAAW;IACX,KAAK;IACL,OAAO;IACP,KAAK;IACL,gBAAgB;IAChB,KAAK;IACL,UAAU;IACV,KAAK;IACL,SAAS;IACT,KAAK;IACL,KAAK;IACL,QAAQ;CACR,CAAC,CAAA;AAEF,8DAA8D;AAC9D,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC;IACxB,eAAe;IACf,IAAI;IACJ,IAAI;IACJ,GAAG;IACH,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,GAAG;IACH,IAAI;IACJ,KAAK;IACL,MAAM;IACN,2BAA2B;IAC3B,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,SAAS;IACT,IAAI;IACJ,KAAK;IACL,KAAK;IACL,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,MAAM;IACN,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,QAAQ;CACR,CAAC,CAAA;AAEF;;;GAGG;AACH,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC;IACzB,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,KAAK;IACL,OAAO;IACP,KAAK;IACL,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,KAAK;IACL,OAAO;IACP,IAAI;IACJ,KAAK;IACL,KAAK;IACL,OAAO;IACP,QAAQ;IACR,QAAQ;IACR,MAAM;IACN,KAAK;IACL,IAAI;IACJ,KAAK;CACL,CAAC,CAAA;AAEF,MAAM,SAAS,GAAG,CAAC,CAA4B,EAAe,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAA;AAC7G,MAAM,IAAI,GAAG,CAAC,KAAa,EAAU,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,WAAW,EAAE,CAAA;AAC9E,MAAM,SAAS,GAAG,CAAC,CAAS,EAAE,CAAS,EAAU,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAA;AAEzE;;;GAGG;AACH,MAAM,UAAU,eAAe,CAAC,KAAgC;IAC/D,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC;QAAE,OAAO,IAAI,CAAA;IAElC,MAAM,MAAM,GAAe,EAAE,CAAA
;IAE7B,kFAAkF;IAClF,IAAI,OAAO,GAAG,KAAK;SACjB,OAAO,CAAC,oBAAoB,EAAE,CAAC,EAAE,EAAE,CAAS,EAAE,EAAE;QAChD,IAAI,CAAC,MAAM,CAAC,QAAQ;YAAE,MAAM,CAAC,QAAQ,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;QAChD,OAAO,GAAG,CAAA;IACX,CAAC,CAAC;SACD,OAAO,CAAC,kBAAkB,EAAE,CAAC,EAAE,EAAE,CAAS,EAAE,EAAE;QAC9C,IAAI,CAAC,MAAM,CAAC,QAAQ;YAAE,MAAM,CAAC,QAAQ,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;QAChD,OAAO,GAAG,CAAA;IACX,CAAC,CAAC;SACD,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,IAAI,EAAE,CAAA;IAER,wFAAwF;IACxF,mFAAmF;IACnF,IAAI,SAAS,CAAC,OAAO,EAAE,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;QACnC,MAAM,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAA;QAC5D,IAAI,IAAI,IAAI,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YACnE,MAAM,CAAC,MAAM,GAAG,IAAI,CAAA;YACpB,OAAO,GAAG,IAAK,CAAA;QAChB,CAAC;aAAM,IAAI,IAAI,IAAI,IAAI,EAAE,CAAC;YACzB,OAAO,GAAG,GAAG,IAAI,IAAI,IAAI,EAAE,CAAA;QAC5B,CAAC;IACF,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;IACnD,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAA;IAE1E,8BAA8B;IAC9B,MAAM,WAAW,GAAa,EAAE,CAAA;IAChC,OAAO,MAAM,CAAC,MAAM,GAAG,CAAC,IAAI,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,CAAC,EAAE,CAAC;QAC1D,WAAW,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,EAAG,CAAC,CAAA;IAClC,CAAC;IACD,IAAI,WAAW,CAAC,MAAM;QAAE,MAAM,CAAC,MAAM,GAAG,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IAE7D,mEAAmE;IACnE,MAAM,WAAW,GAAa,EAAE,CAAA;IAChC,OAAO,MAAM,CAAC,MAAM,GAAG,CAAC,IAAI,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC,CAAC,EAAE,CAAC;QAC5E,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC,GAAG,EAAG,CAAC,CAAA;IACnC,CAAC;IACD,IAAI,WAAW,CAAC,MAAM,EAAE,CAAC;QACxB,MAAM,CAAC,MAAM,GAAG,SAAS,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,GAAG,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IAC/G,CAAC;IAED,IAAI,MAAM,
CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,MAAM,CAAA;IAEtC,4FAA4F;IAC5F,IAAI,aAAa,GAAG,CAAC,CAAC,CAAA;IACtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,uEAAuE;QACvE,IAAI,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC9D,aAAa,GAAG,CAAC,CAAA;YACjB,MAAK;QACN,CAAC;IACF,CAAC;IAED,IAAI,aAAa,IAAI,CAAC,EAAE,CAAC;QACxB,IAAI,CAAC,GAAG,aAAa,CAAA;QACrB,MAAM,aAAa,GAAa,EAAE,CAAA;QAClC,OAAO,CAAC,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,IAAI,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,CAAC,EAAE,CAAC;YACjE,aAAa,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,CAAA;YAC9B,CAAC,EAAE,CAAA;QACJ,CAAC;QACD,MAAM,CAAC,cAAc,GAAG,aAAa,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QAC/C,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QACzC,MAAM,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,aAAa,CAAC,CAAA;QAC7C,IAAI,MAAM,CAAC,MAAM;YAAE,MAAM,CAAC,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAA;QAC3C,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC;YAAE,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QAChE,OAAO,MAAM,CAAA;IACd,CAAC;IAED,iFAAiF;IACjF,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,CAAC,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAA;QACxB,OAAO,MAAM,CAAA;IACd,CAAC;IACD,MAAM,CAAC,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAA;IACxB,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IACzC,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC;QAAE,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IAEpE,OAAO,MAAM,CAAA;AACd,CAAC"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Organization-name canonicalization — reduce a company name to a stable, comparable key.
|
|
7
|
+
*
|
|
8
|
+
* Winkler's record-linkage recipe: words of little distinguishing power (the legal designation —
|
|
9
|
+
* `Corporation`, `Limited`, `LLC`) are normalized away before matching, so `Acme Corp` and `Acme
|
|
10
|
+
* Corporation, LLC` collapse to the same key. We also split off a `doing business as` clause and
|
|
11
|
+
* normalize connectives (`&` → `and`), punctuation, accents, and a leading `The`.
|
|
12
|
+
*
|
|
13
|
+
* Evidence honesty (per the name-canonicalization research pass): the PERSON-name side is well
|
|
14
|
+
* sourced; the ORGANIZATION side is a known evidence gap. This is a solid _canonicalization_
|
|
15
|
+
* baseline (the strip-designations principle is Winkler-grounded; the designation list draws on
|
|
16
|
+
* the ISO 20275 Entity Legal Forms register and `cleanco`). The harder org-_matching_ problems —
|
|
17
|
+
* acronym ↔ expansion (`IBM` ↔ `International Business Machines`), DBA/alias resolution beyond
|
|
18
|
+
* the simple clause, subsidiary/parent, and TF-IDF n-gram token matching — are deferred to a
|
|
19
|
+
* follow-up (a dedicated org-matching research pass + the matcher epic).
|
|
20
|
+
*/
|
|
21
|
+
/** A canonicalized organization name. */
|
|
22
|
+
export interface OrganizationName {
|
|
23
|
+
/** The original input exactly as it was passed in (not trimmed or otherwise altered). */
|
|
24
|
+
raw: string;
|
|
25
|
+
/** Normalized, designation-stripped key for blocking and comparison. May be an empty string when every token of the legal name is a designation or punctuation. */
|
|
26
|
+
canonical: string;
|
|
27
|
+
/** Legal designations stripped from the legal-name part (`llc`, `inc`, `gmbh`), in normalized lowercase form and encounter order. Designations found inside the trade-name clause are not recorded here. */
|
|
28
|
+
designations: string[];
|
|
29
|
+
/** The `doing business as` / trade-name clause, canonicalized the same way as `canonical`. Omitted when no clause was present or when the clause canonicalized to an empty string. */
|
|
30
|
+
dba?: string;
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Canonicalize an organization name: split off any `doing business as` clause, then reduce the
|
|
34
|
+
* legal name to a designation-stripped key. Returns `null` when `input` is not a string or is blank (empty or whitespace-only).
|
|
35
|
+
*/
|
|
36
|
+
export declare function canonicalizeOrganizationName(input: string | null | undefined): OrganizationName | null;
|
|
37
|
+
//# sourceMappingURL=organization.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"organization.d.ts","sourceRoot":"","sources":["../organization.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAEH,yCAAyC;AACzC,MAAM,WAAW,gBAAgB;IAChC,oCAAoC;IACpC,GAAG,EAAE,MAAM,CAAA;IACX,wEAAwE;IACxE,SAAS,EAAE,MAAM,CAAA;IACjB,wFAAwF;IACxF,YAAY,EAAE,MAAM,EAAE,CAAA;IACtB,wFAAwF;IACxF,GAAG,CAAC,EAAE,MAAM,CAAA;CACZ;AA8FD;;;GAGG;AACH,wBAAgB,4BAA4B,CAAC,KAAK,EAAE,MAAM,GAAG,IAAI,GAAG,SAAS,GAAG,gBAAgB,GAAG,IAAI,CAgBtG"}
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Organization-name canonicalization — reduce a company name to a stable, comparable key.
|
|
7
|
+
*
|
|
8
|
+
* Winkler's record-linkage recipe: words of little distinguishing power (the legal designation —
|
|
9
|
+
* `Corporation`, `Limited`, `LLC`) are normalized away before matching, so `Acme Corp` and `Acme
|
|
10
|
+
* Corporation, LLC` collapse to the same key. We also split off a `doing business as` clause and
|
|
11
|
+
* normalize connectives (`&` → `and`), punctuation, accents, and a leading `The`.
|
|
12
|
+
*
|
|
13
|
+
* Evidence honesty (per the name-canonicalization research pass): the PERSON-name side is well
|
|
14
|
+
* sourced; the ORGANIZATION side is a known evidence gap. This is a solid _canonicalization_
|
|
15
|
+
* baseline (the strip-designations principle is Winkler-grounded; the designation list draws on
|
|
16
|
+
* the ISO 20275 Entity Legal Forms register and `cleanco`). The harder org-_matching_ problems —
|
|
17
|
+
* acronym ↔ expansion (`IBM` ↔ `International Business Machines`), DBA/alias resolution beyond
|
|
18
|
+
* the simple clause, subsidiary/parent, and TF-IDF n-gram token matching — are deferred to a
|
|
19
|
+
* follow-up (a dedicated org-matching research pass + the matcher epic).
|
|
20
|
+
*/
|
|
21
|
+
/**
|
|
22
|
+
* Legal-entity designations across jurisdictions, normalized to lowercase with punctuation removed
|
|
23
|
+
* (so `L.L.C.` → `llc`). Drawn from the ISO 20275 Entity Legal Forms register + `cleanco`'s common
|
|
24
|
+
* set. Stripped as whole tokens wherever they occur. Deliberately excludes name-meaningful words
|
|
25
|
+
* (`group`, `holdings`, `partners`, `associates`).
|
|
26
|
+
*/
|
|
27
|
+
const DESIGNATIONS = new Set([
    // Entries are compared against whole, already-normalized tokens (lowercase, punctuation removed).
    "inc", "incorporated", "corp", "corporation", "co", "company",
    "llc", "lllp", "llp", "pllc", "lp", "ltd", "limited", "plc", "pc", "pa",
    "ag", "sa", "sas", "sarl", "sl", "gmbh", "mbh", "ug", "bv", "nv",
    "oy", "oyj", "ab", "as", "asa", "spa", "srl", "kg", "kgaa", "kk",
    "pty", "proprietary", "bhd", "sdn", "cc", "cv", "ulc", "aps",
    "kft", "zrt", "doo", "ood", "ead",
]);
/**
 * Splits a `doing business as` / trade-name clause from a legal name.
 *
 * The marker must be preceded by whitespace, a comma, a semicolon, or an opening parenthesis, and
 * followed by whitespace or a colon, so a marker embedded in a word (`Cordoba`) never matches.
 * Dotted abbreviations (`d.b.a.`, `a.k.a.`, `f.k.a.`) are accepted alongside the slashed and bare
 * forms; without that, the marker and trade name would leak into the legal-name key.
 * The pattern has no capture groups, so `String.prototype.split` returns only the name parts.
 */
const DBA_PATTERN = /[\s,;(]+(?:d\/b\/a|d\.?b\.?a\.?|doing business as|t\/a|trading as|a\/k\/a|a\.?k\.?a\.?|f\/k\/a|f\.?k\.?a\.?)[\s:]+/i;
/**
 * Canonicalize one name fragment: lowercase, strip accents, connectives → `and`, drop punctuation,
 * remove a leading `the`, strip legal designations, collapse whitespace.
 *
 * @param {string} fragment The raw name fragment (legal name or trade-name clause).
 * @returns {{ canonical: string, designations: string[] }} The key (possibly empty when every
 *   token was a designation or punctuation) plus the designations removed, in encounter order.
 */
function canonicalizeFragment(fragment) {
    const normalized = fragment
        // Decompose so accents become separate combining marks, then drop those marks: "é" → "e".
        .normalize("NFKD")
        .replace(/[\u0300-\u036f]/g, "")
        .toLowerCase()
        // connective punctuation joins words rather than vanishing: "AT&T" → "at and t"
        .replace(/[&+]/g, " and ")
        // periods + apostrophes are intra-token, so remove (not space): "S.A." → "sa", "Macy's" → "macys"
        .replace(/[.'\u2019]/g, "")
        // Everything that is not a letter, mark, digit, or whitespace becomes a separator. The class
        // is Unicode-aware so letters without an ASCII decomposition ("æ", "ø", "ß") and non-Latin
        // scripts survive; an ASCII-only class would split or erase such names entirely.
        .replace(/[^\p{L}\p{M}\p{N}\s]/gu, " ")
        .replace(/\s+/g, " ")
        .trim()
        .replace(/^the\s+/, "");
    const designations = [];
    const kept = [];
    for (const token of normalized.split(" ")) {
        // An empty fragment splits to [""]; skip the empty token.
        if (!token) {
            continue;
        }
        if (DESIGNATIONS.has(token)) {
            designations.push(token);
        }
        else {
            kept.push(token);
        }
    }
    return { canonical: kept.join(" "), designations };
}
/**
 * Canonicalize an organization name: split off any `doing business as` clause, then reduce the
 * legal name to a designation-stripped key.
 *
 * @param {string | null | undefined} input The organization name as received.
 * @returns The canonical record, or `null` when `input` is not a string or is blank. `raw` is the
 *   input unchanged; `designations` come from the legal-name part only; `dba` is present only when
 *   a trade-name clause exists and canonicalizes to a non-empty key.
 */
export function canonicalizeOrganizationName(input) {
    if (typeof input !== "string" || !input.trim()) {
        return null;
    }
    // Everything before the first marker is the legal name; any later parts form the trade name.
    const [legalPart = "", ...dbaParts] = input.split(DBA_PATTERN);
    const { canonical, designations } = canonicalizeFragment(legalPart);
    const dba = dbaParts.length ? canonicalizeFragment(dbaParts.join(" ")).canonical : "";
    return dba
        ? { raw: input, canonical, designations, dba }
        : { raw: input, canonical, designations };
}
|
|
129
|
+
//# sourceMappingURL=organization.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"organization.js","sourceRoot":"","sources":["../organization.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAcH;;;;;GAKG;AACH,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC;IAC5B,KAAK;IACL,cAAc;IACd,MAAM;IACN,aAAa;IACb,IAAI;IACJ,SAAS;IACT,KAAK;IACL,MAAM;IACN,KAAK;IACL,MAAM;IACN,IAAI;IACJ,KAAK;IACL,SAAS;IACT,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,MAAM;IACN,IAAI;IACJ,MAAM;IACN,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,KAAK;IACL,KAAK;IACL,IAAI;IACJ,MAAM;IACN,IAAI;IACJ,KAAK;IACL,aAAa;IACb,KAAK;IACL,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;CACL,CAAC,CAAA;AAEF,0EAA0E;AAC1E,MAAM,WAAW,GAAG,kFAAkF,CAAA;AAEtG;;;;GAIG;AACH,SAAS,oBAAoB,CAAC,QAAgB;IAC7C,MAAM,UAAU,GAAG,QAAQ;SACzB,SAAS,CAAC,MAAM,CAAC;SACjB,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;SACrB,WAAW,EAAE;QACd,gFAAgF;SAC/E,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC;SACtB,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC;QACxB,kGAAkG;SACjG,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;SACrB,OAAO,CAAC,cAAc,EAAE,GAAG,CAAC;SAC5B,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,IAAI,EAAE;SACN,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAA;IAExB,MAAM,YAAY,GAAa,EAAE,CAAA;IACjC,MAAM,IAAI,GAAa,EAAE,CAAA;IACzB,KAAK,MAAM,KAAK,IAAI,UAAU,CAAC,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC;QAC3C,IAAI,CAAC,KAAK;YAAE,SAAQ;QACpB,IAAI,YAAY,CAAC,GAAG,CAAC,KAAK,CAAC;YAAE,YAAY,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;;YAChD,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IACtB,CAAC;IAED,OAAO,EAAE,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,YAAY,EAAE,CAAA;AACnD,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,4BAA4B,CAAC,KAAgC;IAC5E,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE;QAAE,OAAO,IAAI,CAAA;IAE3D,MAAM,GAAG,GAAG,KAAK,CAAA;IACjB,MAAM,CAAC,SAAS,EAAE,GAAG,QAAQ,CAAC,GAAG,KAAK,CAAC,KAAK,CAAC,WAAW,CAAC,CAAA;IAEzD,MAAM,EAAE,SAAS,EAAE,YAAY,EAAE,GAAG,oBAAoB,CAAC,SAAS,IAAI,EAAE,CAAC,CAAA;IAEzE,MAAM,MAAM,GAAqB,EAAE,GAAG,EAAE,SAAS,EAAE,YAAY,EAAE,CAAA;IAEjE,IAAI,QAAQ,CAAC,MAAM,EAAE,CAAC;QACrB,MAAM,GAAG,GAAG,oBAAoB,CAAC,QAAQ,CAAC,IAAI,CAAC,GAAG,CAA
C,CAAC,CAAC,SAAS,CAAA;QAC9D,IAAI,GAAG;YAAE,MAAM,CAAC,GAAG,GAAG,GAAG,CAAA;IAC1B,CAAC;IAED,OAAO,MAAM,CAAA;AACd,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@mailwoman/record",
|
|
3
|
+
"version": "4.8.1",
|
|
4
|
+
"description": "Lean, plain-TypeScript record schema + per-field normalizers for the geocode-first matcher. Address-first: the canonical PostalAddress record composes parser components, the formatter's match key, and a resolved geocode; organization + contact records build on the same spine.",
|
|
5
|
+
"license": "AGPL-3.0-only",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "https://github.com/sister-software/mailwoman.git",
|
|
9
|
+
"directory": "record"
|
|
10
|
+
},
|
|
11
|
+
"type": "module",
|
|
12
|
+
"exports": {
|
|
13
|
+
"./package.json": "./package.json",
|
|
14
|
+
".": "./out/index.js",
|
|
15
|
+
"./address": "./out/address.js",
|
|
16
|
+
"./name": "./out/name.js",
|
|
17
|
+
"./organization": "./out/organization.js"
|
|
18
|
+
},
|
|
19
|
+
"dependencies": {
|
|
20
|
+
"@mailwoman/formatter": "4.8.1"
|
|
21
|
+
},
|
|
22
|
+
"devDependencies": {
|
|
23
|
+
"@types/node": "^25.9.2"
|
|
24
|
+
},
|
|
25
|
+
"files": [
|
|
26
|
+
"out/**/*.js",
|
|
27
|
+
"out/**/*.js.map",
|
|
28
|
+
"out/**/*.d.ts",
|
|
29
|
+
"out/**/*.d.ts.map"
|
|
30
|
+
],
|
|
31
|
+
"publishConfig": {
|
|
32
|
+
"access": "public"
|
|
33
|
+
}
|
|
34
|
+
}
|