@mailwoman/record 4.10.0 → 4.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +73 -0
- package/out/address.d.ts +2 -2
- package/out/address.js +2 -2
- package/out/index.d.ts +1 -1
- package/out/index.js +1 -1
- package/out/organization.d.ts +52 -5
- package/out/organization.d.ts.map +1 -1
- package/out/organization.js +90 -14
- package/out/organization.js.map +1 -1
- package/package.json +3 -3
package/README.md
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# @mailwoman/record
|
|
2
|
+
|
|
3
|
+
**Record schema and per-field normalizers** for the geocode-first entity
|
|
4
|
+
resolution matcher. Address-first design: the canonical `PostalAddress` record
|
|
5
|
+
composes parsed address components, the formatter's match key, and a resolved
|
|
6
|
+
geocode. Organization and contact records build on the same canonical record.
|
|
7
|
+
|
|
8
|
+
```ts
|
|
9
|
+
import { PostalAddress, parsePersonName, canonicalizeOrganizationName } from "@mailwoman/record";
|
|
10
|
+
|
|
11
|
+
// Canonical address record (parser output → canonical form)
|
|
12
|
+
const address: PostalAddress = {
|
|
13
|
+
components: { house_number: "1600", street: "Amphitheatre Parkway", ... },
|
|
14
|
+
canonicalKey: "1600 amphitheatre pkwy mountain view ca 94043",
|
|
15
|
+
coordinate: { lat: 37.4224, lon: -122.0842 },
|
|
16
|
+
};
|
|
17
|
+
|
|
18
|
+
// Person name parsing
|
|
19
|
+
const name = parsePersonName("Jane L. Smith");
|
|
20
|
+
// → { given: "Jane", middleInitial: "L", surname: "Smith" }
|
|
21
|
+
|
|
22
|
+
// Organization name canonicalization (for dedup)
|
|
23
|
+
canonicalizeOrganizationName("Baylor University Medical Center");
|
|
24
|
+
// → { canonical: "baylor university medical center", ... } (an OrganizationName object; null for empty input)
|
|
25
|
+
|
|
26
|
+
canonicalizeOrganizationName("Baylor Univ. Med. Ctr.");
|
|
27
|
+
// → { canonical: "baylor university medical center", ... } (same canonical key)
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## API
|
|
31
|
+
|
|
32
|
+
```ts
|
|
33
|
+
// Address record (the core of the record system)
|
|
34
|
+
import { PostalAddress, createPostalAddress } from "@mailwoman/record/address"
|
|
35
|
+
|
|
36
|
+
// Person name parsing → structured components
|
|
37
|
+
import { parsePersonName, type ParsedPersonName } from "@mailwoman/record/name"
|
|
38
|
+
|
|
39
|
+
// Organization name canonicalization → matchable key
|
|
40
|
+
import { canonicalizeOrganizationName, type CanonicalizeOptions } from "@mailwoman/record/organization"
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## What it normalizes
|
|
44
|
+
|
|
45
|
+
| Field | Normalizer | Purpose |
|
|
46
|
+
| ---------------- | ------------------------------ | ------------------------------------------------------------- |
|
|
47
|
+
| **Address** | `createPostalAddress` | Parse components + formatter key + geocode → canonical record |
|
|
48
|
+
| **Person name** | `parsePersonName` | "Jane L. Smith" → `{given, middleInitial, surname}` |
|
|
49
|
+
| **Organization** | `canonicalizeOrganizationName` | "Baylor Univ. Med. Ctr." → "baylor university medical center" |
|
|
50
|
+
|
|
51
|
+
## Design
|
|
52
|
+
|
|
53
|
+
- **Plain data, no classes.** Records are plain TypeScript objects with
|
|
54
|
+
branded types where needed.
|
|
55
|
+
- **Address-first.** The `PostalAddress` is the canonical record — the geocode-first
|
|
56
|
+
matcher resolves places, not strings.
|
|
57
|
+
- **Domain-scoped.** Organization canonicalization supports jurisdiction and
|
|
58
|
+
domain context (e.g., `{jurisdiction: "ID"}` for Indonesian legal designations,
|
|
59
|
+
`{domain: "healthcare"}` to keep PT/SCA from being stripped as legal forms when they are medical
|
|
60
|
+
abbreviations).
|
|
61
|
+
- **Lean dependencies.** Only depends on `@mailwoman/formatter` for the
|
|
62
|
+
canonical key.
|
|
63
|
+
|
|
64
|
+
## Related
|
|
65
|
+
|
|
66
|
+
- [`@mailwoman/match`](../match) — the fuzzy matcher that consumes these records
|
|
67
|
+
- [`@mailwoman/formatter`](../formatter) — `canonicalKey` used by `PostalAddress`
|
|
68
|
+
- [`@mailwoman/registry`](../registry) — high-level `resolveEntities` that uses records
|
|
69
|
+
- [Geocode-First Record Matching](https://mailwoman.sister.software/articles/concepts/geocode-first-record-matching/)
|
|
70
|
+
|
|
71
|
+
## License
|
|
72
|
+
|
|
73
|
+
[AGPL-3.0-only](https://www.gnu.org/licenses/agpl-3.0.html)
|
package/out/address.d.ts
CHANGED
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
* @license AGPL-3.0
|
|
4
4
|
* @author Teffen Ellis, et al.
|
|
5
5
|
*
|
|
6
|
-
* The canonical address record — the matcher's unit of address identity, and the
|
|
7
|
-
* organization and contact records build on.
|
|
6
|
+
* The canonical address record — the matcher's unit of address identity, and the canonical record
|
|
7
|
+
* the organization and contact records build on.
|
|
8
8
|
*
|
|
9
9
|
* It is plain data: parser components + the formatter's match key + an optional resolved geocode,
|
|
10
10
|
* composed into one object. No ORM, no decorators, no schema-generation machinery — if we need a
|
package/out/address.js
CHANGED
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
* @license AGPL-3.0
|
|
4
4
|
* @author Teffen Ellis, et al.
|
|
5
5
|
*
|
|
6
|
-
* The canonical address record — the matcher's unit of address identity, and the
|
|
7
|
-
* organization and contact records build on.
|
|
6
|
+
* The canonical address record — the matcher's unit of address identity, and the canonical record
|
|
7
|
+
* the organization and contact records build on.
|
|
8
8
|
*
|
|
9
9
|
* It is plain data: parser components + the formatter's match key + an optional resolved geocode,
|
|
10
10
|
* composed into one object. No ORM, no decorators, no schema-generation machinery — if we need a
|
package/out/index.d.ts
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
*
|
|
6
6
|
* `@mailwoman/record` — the canonicalize layer for the geocode-first matcher.
|
|
7
7
|
*
|
|
8
|
-
* Address-first: {@linkcode PostalAddress} is the
|
|
8
|
+
* Address-first: {@linkcode PostalAddress} is the canonical record. The per-field normalizers
|
|
9
9
|
* ({@linkcode parsePersonName}, {@linkcode canonicalizeOrganizationName}) build on the same
|
|
10
10
|
* plain-data pattern. Contact records and the comparator/Fellegi-Sunter layer land in the
|
|
11
11
|
* matcher.
|
package/out/index.js
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
*
|
|
6
6
|
* `@mailwoman/record` — the canonicalize layer for the geocode-first matcher.
|
|
7
7
|
*
|
|
8
|
-
* Address-first: {@linkcode PostalAddress} is the
|
|
8
|
+
* Address-first: {@linkcode PostalAddress} is the canonical record. The per-field normalizers
|
|
9
9
|
* ({@linkcode parsePersonName}, {@linkcode canonicalizeOrganizationName}) build on the same
|
|
10
10
|
* plain-data pattern. Contact records and the comparator/Fellegi-Sunter layer land in the
|
|
11
11
|
* matcher.
|
package/out/organization.d.ts
CHANGED
|
@@ -10,13 +10,32 @@
|
|
|
10
10
|
* Corporation, LLC` collapse to the same key. We also split off a `doing business as` clause and
|
|
11
11
|
* normalize connectives (`&` → `and`), punctuation, accents, and a leading `The`.
|
|
12
12
|
*
|
|
13
|
+
* **The collision problem (#668).** A legal-form token in one jurisdiction is a meaningful word in
|
|
14
|
+
* another domain. `PT` is Indonesia's `Perseroan Terbatas` (its LLC) — and US-healthcare
|
|
15
|
+
* shorthand for _Physical Therapy_. `SCA` / `SCS` are French/Belgian/Luxembourg commandite forms
|
|
16
|
+
* — and, in a clinic's name, _Sudden Cardiac Arrest_ / _Spinal Cord Stimulator_. A single
|
|
17
|
+
* universal strip-list can't be right for both: strip `PT` and you corrupt `Lakeside PT`; keep it
|
|
18
|
+
* and you leave the legal form on an Indonesian company. So the strip-set is computed on **two
|
|
19
|
+
* axes**:
|
|
20
|
+
*
|
|
21
|
+
* - **jurisdiction** (ISO 3166-1 alpha-2, e.g. from the resolved address country) — _adds_ the legal
|
|
22
|
+
* forms valid in that country. Collision-prone forms (`pt`, `sca`, `scs`) live here, gated
|
|
23
|
+
* behind a known jurisdiction, NOT in the universal base.
|
|
24
|
+
* - **domain** (an ingest-config tag, e.g. `healthcare`) — _protects_ domain-meaningful tokens from
|
|
25
|
+
* ever being stripped, even when a jurisdiction pack would add them. Domain protection wins.
|
|
26
|
+
*
|
|
27
|
+
* `effective = (base ∪ jurisdiction-pack) − domain-protect-pack`. With no options the set is the
|
|
28
|
+
* universal base and behavior is byte-for-byte unchanged — the new axes are strictly opt-in.
|
|
29
|
+
*
|
|
13
30
|
* Evidence honesty (per the name-canonicalization research pass): the PERSON-name side is well
|
|
14
31
|
* sourced; the ORGANIZATION side is a known evidence gap. This is a solid _canonicalization_
|
|
15
32
|
* baseline (the strip-designations principle is Winkler-grounded; the designation list draws on
|
|
16
|
-
* the ISO 20275 Entity Legal Forms register and `cleanco`). The
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
33
|
+
* the ISO 20275 Entity Legal Forms register and `cleanco`). The jurisdiction/domain packs below
|
|
34
|
+
* are grounded seeds, not exhaustive — extend them per ISO 20275 as locales are added. The harder
|
|
35
|
+
* org-_matching_ problems — acronym ↔ expansion (`IBM` ↔ `International Business Machines`),
|
|
36
|
+
* DBA/alias resolution beyond the simple clause, subsidiary/parent, and TF-IDF n-gram token
|
|
37
|
+
* matching — are deferred to a follow-up (a dedicated org-matching research pass + the matcher
|
|
38
|
+
* epic).
|
|
20
39
|
*/
|
|
21
40
|
/** A canonicalized organization name. */
|
|
22
41
|
export interface OrganizationName {
|
|
@@ -29,9 +48,37 @@ export interface OrganizationName {
|
|
|
29
48
|
/** The `doing business as` / trade-name clause, canonicalized, when one was present. */
|
|
30
49
|
dba?: string;
|
|
31
50
|
}
|
|
51
|
+
/**
|
|
52
|
+
* A domain pack name. Each protects the abbreviations that are meaningful in that domain from being
|
|
53
|
+
* stripped as legal forms (see {@link DOMAIN_PROTECTED}). `general` protects nothing — the explicit
|
|
54
|
+
* "no domain" choice. Add a pack here (and to {@link DOMAIN_PROTECTED}) per ingest domain.
|
|
55
|
+
*/
|
|
56
|
+
export type DesignationDomain = "general" | "healthcare";
|
|
57
|
+
/**
|
|
58
|
+
* Context for {@link canonicalizeOrganizationName}. Omit both fields for the universal base
|
|
59
|
+
* behavior.
|
|
60
|
+
*/
|
|
61
|
+
export interface CanonicalizeOptions {
|
|
62
|
+
/**
|
|
63
|
+
* ISO 3166-1 alpha-2 country code of the org's jurisdiction (typically the resolved address
|
|
64
|
+
* country). Adds that country's legal forms — including collision-prone ones gated out of the
|
|
65
|
+
* base — to the strip-set. Case-insensitive; unknown codes add nothing.
|
|
66
|
+
*/
|
|
67
|
+
jurisdiction?: string;
|
|
68
|
+
/**
|
|
69
|
+
* Ingest domain. Protects domain-meaningful abbreviations (e.g. `healthcare` protects `pt` /
|
|
70
|
+
* `sca` / `scs`) from being stripped, overriding any jurisdiction pack that would add them.
|
|
71
|
+
*/
|
|
72
|
+
domain?: DesignationDomain;
|
|
73
|
+
}
|
|
32
74
|
/**
|
|
33
75
|
* Canonicalize an organization name: split off any `doing business as` clause, then reduce the
|
|
34
76
|
* legal name to a designation-stripped key. Returns `null` for empty input.
|
|
77
|
+
*
|
|
78
|
+
* Pass {@link CanonicalizeOptions} to resolve the jurisdiction × domain collision (#668): a
|
|
79
|
+
* `jurisdiction` adds that country's legal forms, a `domain` protects its meaningful abbreviations.
|
|
80
|
+
* With no options the universal base set is used and the result is byte-for-byte the legacy
|
|
81
|
+
* behavior.
|
|
35
82
|
*/
|
|
36
|
-
export declare function canonicalizeOrganizationName(input: string | null | undefined): OrganizationName | null;
|
|
83
|
+
export declare function canonicalizeOrganizationName(input: string | null | undefined, options?: CanonicalizeOptions): OrganizationName | null;
|
|
37
84
|
//# sourceMappingURL=organization.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"organization.d.ts","sourceRoot":"","sources":["../organization.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"organization.d.ts","sourceRoot":"","sources":["../organization.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAsCG;AAEH,yCAAyC;AACzC,MAAM,WAAW,gBAAgB;IAChC,oCAAoC;IACpC,GAAG,EAAE,MAAM,CAAA;IACX,wEAAwE;IACxE,SAAS,EAAE,MAAM,CAAA;IACjB,wFAAwF;IACxF,YAAY,EAAE,MAAM,EAAE,CAAA;IACtB,wFAAwF;IACxF,GAAG,CAAC,EAAE,MAAM,CAAA;CACZ;AAED;;;;GAIG;AACH,MAAM,MAAM,iBAAiB,GAAG,SAAS,GAAG,YAAY,CAAA;AAExD;;;GAGG;AACH,MAAM,WAAW,mBAAmB;IACnC;;;;OAIG;IACH,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB;;;OAGG;IACH,MAAM,CAAC,EAAE,iBAAiB,CAAA;CAC1B;AAoJD;;;;;;;;GAQG;AACH,wBAAgB,4BAA4B,CAC3C,KAAK,EAAE,MAAM,GAAG,IAAI,GAAG,SAAS,EAChC,OAAO,CAAC,EAAE,mBAAmB,GAC3B,gBAAgB,GAAG,IAAI,CAiBzB"}
|
package/out/organization.js
CHANGED
|
@@ -10,21 +10,42 @@
|
|
|
10
10
|
* Corporation, LLC` collapse to the same key. We also split off a `doing business as` clause and
|
|
11
11
|
* normalize connectives (`&` → `and`), punctuation, accents, and a leading `The`.
|
|
12
12
|
*
|
|
13
|
+
* **The collision problem (#668).** A legal-form token in one jurisdiction is a meaningful word in
|
|
14
|
+
* another domain. `PT` is Indonesia's `Perseroan Terbatas` (its LLC) — and US-healthcare
|
|
15
|
+
* shorthand for _Physical Therapy_. `SCA` / `SCS` are French/Belgian/Luxembourg commandite forms
|
|
16
|
+
* — and, in a clinic's name, _Sudden Cardiac Arrest_ / _Spinal Cord Stimulator_. A single
|
|
17
|
+
* universal strip-list can't be right for both: strip `PT` and you corrupt `Lakeside PT`; keep it
|
|
18
|
+
* and you leave the legal form on an Indonesian company. So the strip-set is computed on **two
|
|
19
|
+
* axes**:
|
|
20
|
+
*
|
|
21
|
+
* - **jurisdiction** (ISO 3166-1 alpha-2, e.g. from the resolved address country) — _adds_ the legal
|
|
22
|
+
* forms valid in that country. Collision-prone forms (`pt`, `sca`, `scs`) live here, gated
|
|
23
|
+
* behind a known jurisdiction, NOT in the universal base.
|
|
24
|
+
* - **domain** (an ingest-config tag, e.g. `healthcare`) — _protects_ domain-meaningful tokens from
|
|
25
|
+
* ever being stripped, even when a jurisdiction pack would add them. Domain protection wins.
|
|
26
|
+
*
|
|
27
|
+
* `effective = (base ∪ jurisdiction-pack) − domain-protect-pack`. With no options the set is the
|
|
28
|
+
* universal base and behavior is byte-for-byte unchanged — the new axes are strictly opt-in.
|
|
29
|
+
*
|
|
13
30
|
* Evidence honesty (per the name-canonicalization research pass): the PERSON-name side is well
|
|
14
31
|
* sourced; the ORGANIZATION side is a known evidence gap. This is a solid _canonicalization_
|
|
15
32
|
* baseline (the strip-designations principle is Winkler-grounded; the designation list draws on
|
|
16
|
-
* the ISO 20275 Entity Legal Forms register and `cleanco`). The
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
33
|
+
* the ISO 20275 Entity Legal Forms register and `cleanco`). The jurisdiction/domain packs below
|
|
34
|
+
* are grounded seeds, not exhaustive — extend them per ISO 20275 as locales are added. The harder
|
|
35
|
+
* org-_matching_ problems — acronym ↔ expansion (`IBM` ↔ `International Business Machines`),
|
|
36
|
+
* DBA/alias resolution beyond the simple clause, subsidiary/parent, and TF-IDF n-gram token
|
|
37
|
+
* matching — are deferred to a follow-up (a dedicated org-matching research pass + the matcher
|
|
38
|
+
* epic).
|
|
20
39
|
*/
|
|
21
40
|
/**
|
|
22
|
-
*
|
|
23
|
-
*
|
|
24
|
-
*
|
|
25
|
-
*
|
|
41
|
+
* Universal legal-entity designations — the forms that are safe to strip regardless of jurisdiction
|
|
42
|
+
* or domain because they don't collide with common domain abbreviations. Normalized to lowercase
|
|
43
|
+
* with punctuation removed (so `L.L.C.` → `llc`). Drawn from the ISO 20275 register + `cleanco`'s
|
|
44
|
+
* common set. Stripped as whole tokens wherever they occur. Deliberately excludes name-meaningful
|
|
45
|
+
* words (`group`, `holdings`, `partners`, `associates`) AND the collision-prone forms (`pt`, `sca`,
|
|
46
|
+
* `scs`) — those last live in {@link JURISDICTION_DESIGNATIONS}, gated behind a known jurisdiction.
|
|
26
47
|
*/
|
|
27
|
-
const DESIGNATIONS = new Set([
|
|
48
|
+
const BASE_DESIGNATIONS = new Set([
|
|
28
49
|
"inc",
|
|
29
50
|
"incorporated",
|
|
30
51
|
"corp",
|
|
@@ -74,7 +95,56 @@ const DESIGNATIONS = new Set([
|
|
|
74
95
|
"doo",
|
|
75
96
|
"ood",
|
|
76
97
|
"ead",
|
|
98
|
+
// Belgian forms — safe to add to the base (no domain collision).
|
|
99
|
+
"bvba",
|
|
100
|
+
"sprl",
|
|
77
101
|
]);
|
|
102
|
+
/**
|
|
103
|
+
* Jurisdiction-gated legal forms (ISO 3166-1 alpha-2 → forms), added only when the jurisdiction is
|
|
104
|
+
* known. This is where the collision-prone tokens live: `pt` (Indonesia), `sca` / `scs`
|
|
105
|
+
* (French/Belgian/Luxembourg commandite forms). Stripping these is correct ONLY when we know the
|
|
106
|
+
* org's country — never in the universal base. Grounded seeds, not exhaustive; extend per ISO
|
|
107
|
+
* 20275.
|
|
108
|
+
*/
|
|
109
|
+
const JURISDICTION_DESIGNATIONS = {
|
|
110
|
+
ID: ["pt", "tbk", "ud"], // Perseroan Terbatas / Terbuka (listed) / Usaha Dagang
|
|
111
|
+
FR: ["sca", "scs", "sci", "eurl", "sasu", "snc"],
|
|
112
|
+
BE: ["sca", "scs"],
|
|
113
|
+
LU: ["sca", "scs"],
|
|
114
|
+
ES: ["scs"], // Sociedad en Comandita Simple
|
|
115
|
+
IT: ["sapa", "snc"], // S.a.p.a. (commandite par actions) / società in nome collettivo
|
|
116
|
+
};
|
|
117
|
+
/**
|
|
118
|
+
* Domain protect-sets (domain → tokens never stripped). Overrides any jurisdiction pack: a token
|
|
119
|
+
* here stays in the name even if the org's jurisdiction would treat it as a legal form.
|
|
120
|
+
* `healthcare` guards the clinical abbreviations that collide with gated legal forms — `pt`
|
|
121
|
+
* (Physical Therapy), `sca` (Sudden Cardiac Arrest), `scs` (Spinal Cord Stimulator) — plus a couple
|
|
122
|
+
* of always-clinical ones for future-proofing.
|
|
123
|
+
*/
|
|
124
|
+
const DOMAIN_PROTECTED = {
|
|
125
|
+
general: [],
|
|
126
|
+
healthcare: ["pt", "sca", "scs", "ot", "dpt"],
|
|
127
|
+
};
|
|
128
|
+
/**
|
|
129
|
+
* Compute the effective designation strip-set for the given context: `(base ∪ jurisdiction-pack) −
|
|
130
|
+
* domain-protect-pack`. Returns the shared base set unchanged when no context is given (the
|
|
131
|
+
* byte-stable default), so the common path allocates nothing.
|
|
132
|
+
*/
|
|
133
|
+
function resolveDesignations(options) {
|
|
134
|
+
const jurisdiction = options?.jurisdiction?.trim().toUpperCase();
|
|
135
|
+
const jurisdictionPack = jurisdiction ? JURISDICTION_DESIGNATIONS[jurisdiction] : undefined;
|
|
136
|
+
const protectPack = options?.domain ? DOMAIN_PROTECTED[options.domain] : undefined;
|
|
137
|
+
if (!jurisdictionPack && !protectPack?.length)
|
|
138
|
+
return BASE_DESIGNATIONS;
|
|
139
|
+
const set = new Set(BASE_DESIGNATIONS);
|
|
140
|
+
if (jurisdictionPack)
|
|
141
|
+
for (const token of jurisdictionPack)
|
|
142
|
+
set.add(token);
|
|
143
|
+
if (protectPack)
|
|
144
|
+
for (const token of protectPack)
|
|
145
|
+
set.delete(token);
|
|
146
|
+
return set;
|
|
147
|
+
}
|
|
78
148
|
/** Splits a `doing business as` / trade-name clause from a legal name. */
|
|
79
149
|
const DBA_PATTERN = /\s+(?:d\/b\/a|dba|doing business as|t\/a|trading as|a\/k\/a|aka|fka|f\/k\/a)\s+/i;
|
|
80
150
|
/**
|
|
@@ -82,7 +152,7 @@ const DBA_PATTERN = /\s+(?:d\/b\/a|dba|doing business as|t\/a|trading as|a\/k\/a
|
|
|
82
152
|
* remove a leading `the`, strip legal designations, collapse whitespace. Returns the key plus the
|
|
83
153
|
* designations it removed.
|
|
84
154
|
*/
|
|
85
|
-
function canonicalizeFragment(fragment) {
|
|
155
|
+
function canonicalizeFragment(fragment, designationSet) {
|
|
86
156
|
const normalized = fragment
|
|
87
157
|
.normalize("NFKD")
|
|
88
158
|
.replace(/[̀-ͯ]/g, "")
|
|
@@ -101,7 +171,7 @@ function canonicalizeFragment(fragment) {
|
|
|
101
171
|
for (const token of normalized.split(" ")) {
|
|
102
172
|
if (!token)
|
|
103
173
|
continue;
|
|
104
|
-
if (DESIGNATIONS.has(token))
|
|
174
|
+
if (designationSet.has(token))
|
|
105
175
|
designations.push(token);
|
|
106
176
|
else
|
|
107
177
|
kept.push(token);
|
|
@@ -111,16 +181,22 @@ function canonicalizeFragment(fragment) {
|
|
|
111
181
|
/**
|
|
112
182
|
* Canonicalize an organization name: split off any `doing business as` clause, then reduce the
|
|
113
183
|
* legal name to a designation-stripped key. Returns `null` for empty input.
|
|
184
|
+
*
|
|
185
|
+
* Pass {@link CanonicalizeOptions} to resolve the jurisdiction × domain collision (#668): a
|
|
186
|
+
* `jurisdiction` adds that country's legal forms, a `domain` protects its meaningful abbreviations.
|
|
187
|
+
* With no options the universal base set is used and the result is byte-for-byte the legacy
|
|
188
|
+
* behavior.
|
|
114
189
|
*/
|
|
115
|
-
export function canonicalizeOrganizationName(input) {
|
|
190
|
+
export function canonicalizeOrganizationName(input, options) {
|
|
116
191
|
if (typeof input !== "string" || !input.trim())
|
|
117
192
|
return null;
|
|
118
193
|
const raw = input;
|
|
194
|
+
const designationSet = resolveDesignations(options);
|
|
119
195
|
const [legalPart, ...dbaParts] = input.split(DBA_PATTERN);
|
|
120
|
-
const { canonical, designations } = canonicalizeFragment(legalPart ?? "");
|
|
196
|
+
const { canonical, designations } = canonicalizeFragment(legalPart ?? "", designationSet);
|
|
121
197
|
const result = { raw, canonical, designations };
|
|
122
198
|
if (dbaParts.length) {
|
|
123
|
-
const dba = canonicalizeFragment(dbaParts.join(" ")).canonical;
|
|
199
|
+
const dba = canonicalizeFragment(dbaParts.join(" "), designationSet).canonical;
|
|
124
200
|
if (dba)
|
|
125
201
|
result.dba = dba;
|
|
126
202
|
}
|
package/out/organization.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"organization.js","sourceRoot":"","sources":["../organization.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"organization.js","sourceRoot":"","sources":["../organization.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAsCG;AAuCH;;;;;;;GAOG;AACH,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC;IACjC,KAAK;IACL,cAAc;IACd,MAAM;IACN,aAAa;IACb,IAAI;IACJ,SAAS;IACT,KAAK;IACL,MAAM;IACN,KAAK;IACL,MAAM;IACN,IAAI;IACJ,KAAK;IACL,SAAS;IACT,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,MAAM;IACN,IAAI;IACJ,MAAM;IACN,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,KAAK;IACL,KAAK;IACL,IAAI;IACJ,MAAM;IACN,IAAI;IACJ,KAAK;IACL,aAAa;IACb,KAAK;IACL,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,iEAAiE;IACjE,MAAM;IACN,MAAM;CACN,CAAC,CAAA;AAEF;;;;;;GAMG;AACH,MAAM,yBAAyB,GAAsC;IACpE,EAAE,EAAE,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,CAAC,EAAE,uDAAuD;IAChF,EAAE,EAAE,CAAC,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,CAAC;IAChD,EAAE,EAAE,CAAC,KAAK,EAAE,KAAK,CAAC;IAClB,EAAE,EAAE,CAAC,KAAK,EAAE,KAAK,CAAC;IAClB,EAAE,EAAE,CAAC,KAAK,CAAC,EAAE,+BAA+B;IAC5C,EAAE,EAAE,CAAC,MAAM,EAAE,KAAK,CAAC,EAAE,iEAAiE;CACtF,CAAA;AAED;;;;;;GAMG;AACH,MAAM,gBAAgB,GAAiD;IACtE,OAAO,EAAE,EAAE;IACX,UAAU,EAAE,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,CAAC;CAC7C,CAAA;AAED;;;;GAIG;AACH,SAAS,mBAAmB,CAAC,OAA6B;IACzD,MAAM,YAAY,GAAG,OAAO,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,WAAW,EAAE,CAAA;IAChE,MAAM,gBAAgB,GAAG,YAAY,CAAC,CAAC,CAAC,yBAAyB,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,SAAS,CAAA;IAC3F,MAAM,WAAW,GAAG,OAAO,EAAE,MAAM,CAAC,CAAC,CAAC,gBAAgB,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,SAAS,CAAA;IAElF,IAAI,CAAC,gBAAgB,IAAI,CAAC,WAAW,EAAE,MAAM;QAAE,OAAO,iBAAiB,CAAA;IAEvE,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,iBAAiB,CAAC,CAAA;IACtC,IAAI,gBAAgB;QAAE,KAAK,MAAM,KAAK,IAAI,gBAAgB;YAAE,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,CAAA;IAC1E,IAAI,WAAW;QAAE,KAAK,MAAM,KAAK,IAAI,WAAW;YAAE,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;IACnE,OAAO,GAAG,CAAA;AACX,CAAC;AAED,0EAA0E;AAC1E,MAAM,WAAW,GAAG,kFAAkF,CAAA;AAEtG;;;;GAIG;AACH,SAAS,oBAAoB,CAC5B,QAAgB,EAChB,cAAm
C;IAEnC,MAAM,UAAU,GAAG,QAAQ;SACzB,SAAS,CAAC,MAAM,CAAC;SACjB,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;SACrB,WAAW,EAAE;QACd,gFAAgF;SAC/E,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC;SACtB,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC;QACxB,kGAAkG;SACjG,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;SACrB,OAAO,CAAC,cAAc,EAAE,GAAG,CAAC;SAC5B,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,IAAI,EAAE;SACN,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAA;IAExB,MAAM,YAAY,GAAa,EAAE,CAAA;IACjC,MAAM,IAAI,GAAa,EAAE,CAAA;IACzB,KAAK,MAAM,KAAK,IAAI,UAAU,CAAC,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC;QAC3C,IAAI,CAAC,KAAK;YAAE,SAAQ;QACpB,IAAI,cAAc,CAAC,GAAG,CAAC,KAAK,CAAC;YAAE,YAAY,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;;YAClD,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IACtB,CAAC;IAED,OAAO,EAAE,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,YAAY,EAAE,CAAA;AACnD,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,4BAA4B,CAC3C,KAAgC,EAChC,OAA6B;IAE7B,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE;QAAE,OAAO,IAAI,CAAA;IAE3D,MAAM,GAAG,GAAG,KAAK,CAAA;IACjB,MAAM,cAAc,GAAG,mBAAmB,CAAC,OAAO,CAAC,CAAA;IACnD,MAAM,CAAC,SAAS,EAAE,GAAG,QAAQ,CAAC,GAAG,KAAK,CAAC,KAAK,CAAC,WAAW,CAAC,CAAA;IAEzD,MAAM,EAAE,SAAS,EAAE,YAAY,EAAE,GAAG,oBAAoB,CAAC,SAAS,IAAI,EAAE,EAAE,cAAc,CAAC,CAAA;IAEzF,MAAM,MAAM,GAAqB,EAAE,GAAG,EAAE,SAAS,EAAE,YAAY,EAAE,CAAA;IAEjE,IAAI,QAAQ,CAAC,MAAM,EAAE,CAAC;QACrB,MAAM,GAAG,GAAG,oBAAoB,CAAC,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,cAAc,CAAC,CAAC,SAAS,CAAA;QAC9E,IAAI,GAAG;YAAE,MAAM,CAAC,GAAG,GAAG,GAAG,CAAA;IAC1B,CAAC;IAED,OAAO,MAAM,CAAA;AACd,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mailwoman/record",
|
|
3
|
-
"version": "4.
|
|
4
|
-
"description": "Lean, plain-TypeScript record schema + per-field normalizers for the geocode-first matcher. Address-first: the canonical PostalAddress record composes parser components, the formatter's match key, and a resolved geocode; organization + contact records build on the same
|
|
3
|
+
"version": "4.12.0",
|
|
4
|
+
"description": "Lean, plain-TypeScript record schema + per-field normalizers for the geocode-first matcher. Address-first: the canonical PostalAddress record composes parser components, the formatter's match key, and a resolved geocode; organization + contact records build on the same canonical record.",
|
|
5
5
|
"license": "AGPL-3.0-only",
|
|
6
6
|
"repository": {
|
|
7
7
|
"type": "git",
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
"./organization": "./out/organization.js"
|
|
18
18
|
},
|
|
19
19
|
"dependencies": {
|
|
20
|
-
"@mailwoman/formatter": "4.
|
|
20
|
+
"@mailwoman/formatter": "4.12.0"
|
|
21
21
|
},
|
|
22
22
|
"devDependencies": {
|
|
23
23
|
"@types/node": "^25.9.2"
|