bm-address-normaliser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +29 -0
- package/src/dictionary.json +60 -0
- package/src/index.js +6 -0
- package/src/normalise.js +202 -0
package/package.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "bm-address-normaliser",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Expand abbreviated Malaysian addresses for display and text-to-speech.",
|
|
5
|
+
"main": "src/index.js",
|
|
6
|
+
"files": [
|
|
7
|
+
"src"
|
|
8
|
+
],
|
|
9
|
+
"scripts": {
|
|
10
|
+
"test": "node --test"
|
|
11
|
+
},
|
|
12
|
+
"keywords": [
|
|
13
|
+
"malaysia",
|
|
14
|
+
"bahasa-malaysia",
|
|
15
|
+
"address",
|
|
16
|
+
"nlp",
|
|
17
|
+
"tts",
|
|
18
|
+
"normaliser"
|
|
19
|
+
],
|
|
20
|
+
"author": "Ahmad Hozir <ahozir@gmail.com>",
|
|
21
|
+
"license": "MIT",
|
|
22
|
+
"repository": {
|
|
23
|
+
"type": "git",
|
|
24
|
+
"url": "https://github.com/ahozir-ux/bm-address-normaliser.git"
|
|
25
|
+
},
|
|
26
|
+
"engines": {
|
|
27
|
+
"node": ">=18.0.0"
|
|
28
|
+
}
|
|
29
|
+
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
{
|
|
2
|
+
"JLN": { "expansion": "Jalan", "syllables": "Ja-lan" },
|
|
3
|
+
"TMN": { "expansion": "Taman", "syllables": "Ta-man" },
|
|
4
|
+
"BKT": { "expansion": "Bukit", "syllables": "Bu-kit" },
|
|
5
|
+
"LRG": { "expansion": "Lorong", "syllables": "Lo-rong" },
|
|
6
|
+
"BDR": { "expansion": "Bandar", "syllables": "Ban-dar" },
|
|
7
|
+
"BT": { "expansion": "Batu", "syllables": "Ba-tu" },
|
|
8
|
+
"SEK": { "expansion": "Seksyen", "syllables": "Sek-syen" },
|
|
9
|
+
"TKT": { "expansion": "Tingkat", "syllables": "Ting-kat" },
|
|
10
|
+
"BLK": { "expansion": "Blok", "syllables": "Blok" },
|
|
11
|
+
"APT": { "expansion": "Apartmen", "syllables": "A-part-men" },
|
|
12
|
+
"ULU": { "expansion": "Ulu", "syllables": "U-lu" },
|
|
13
|
+
|
|
14
|
+
"KL": { "expansion": "Kuala Lumpur", "syllables": "Kua-la Lum-pur" },
|
|
15
|
+
"PJ": { "expansion": "Petaling Jaya", "syllables": "Pe-ta-ling Ja-ya" },
|
|
16
|
+
"JB": { "expansion": "Johor Bahru", "syllables": "Jo-hor Bah-ru" },
|
|
17
|
+
"KB": { "expansion": "Kota Bharu", "syllables": "Ko-ta Bha-ru" },
|
|
18
|
+
"KK": { "expansion": "Kota Kinabalu", "syllables": "Ko-ta Ki-na-ba-lu" },
|
|
19
|
+
"PNG": { "expansion": "Pulau Pinang", "syllables": "Pu-lau Pi-nang" },
|
|
20
|
+
"KCH": { "expansion": "Kuching", "syllables": "Ku-ching" },
|
|
21
|
+
|
|
22
|
+
"KG": {
|
|
23
|
+
"expansion": "Kampung",
|
|
24
|
+
"syllables": "Kam-pung",
|
|
25
|
+
"ambiguous_with": "kilogram",
|
|
26
|
+
"rule": "numeric_prefix_is_weight"
|
|
27
|
+
},
|
|
28
|
+
"KPG": {
|
|
29
|
+
"expansion": "Kampung",
|
|
30
|
+
"syllables": "Kam-pung",
|
|
31
|
+
"ambiguous_with": "kilogram",
|
|
32
|
+
"rule": "numeric_prefix_is_weight"
|
|
33
|
+
},
|
|
34
|
+
"SG": {
|
|
35
|
+
"expansion": "Sungai",
|
|
36
|
+
"syllables": "Su-ngai",
|
|
37
|
+
"ambiguous_with": "Singapore",
|
|
38
|
+
"rule": "default_to_malay"
|
|
39
|
+
},
|
|
40
|
+
"KT": {
|
|
41
|
+
"expansion": "Kuala Terengganu",
|
|
42
|
+
"syllables": "Kua-la Te-reng-ga-nu",
|
|
43
|
+
"ambiguous_with": "section code prefix",
|
|
44
|
+
"rule": "numeric_suffix_is_code"
|
|
45
|
+
},
|
|
46
|
+
|
|
47
|
+
"SGR": { "expansion": "Selangor", "syllables": "Se-lan-gor" },
|
|
48
|
+
"JHR": { "expansion": "Johor", "syllables": "Jo-hor" },
|
|
49
|
+
"PHG": { "expansion": "Pahang", "syllables": "Pa-hang" },
|
|
50
|
+
"PRK": { "expansion": "Perak", "syllables": "Pe-rak" },
|
|
51
|
+
"KDH": { "expansion": "Kedah", "syllables": "Ke-dah" },
|
|
52
|
+
"KTN": { "expansion": "Kelantan", "syllables": "Ke-lan-tan" },
|
|
53
|
+
"MLK": { "expansion": "Melaka", "syllables": "Me-la-ka" },
|
|
54
|
+
"NSN": { "expansion": "Negeri Sembilan", "syllables": "Ne-ge-ri Sem-bi-lan" },
|
|
55
|
+
"TRG": { "expansion": "Terengganu", "syllables": "Te-reng-ga-nu" },
|
|
56
|
+
"SBH": { "expansion": "Sabah", "syllables": "Sa-bah" },
|
|
57
|
+
"SWK": { "expansion": "Sarawak", "syllables": "Sa-ra-wak" },
|
|
58
|
+
"LBN": { "expansion": "Labuan", "syllables": "La-bu-an" },
|
|
59
|
+
"PJY": { "expansion": "Putrajaya", "syllables": "Pu-tra-ja-ya" }
|
|
60
|
+
}
|
package/src/index.js
ADDED
package/src/normalise.js
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const dictionary = require('./dictionary.json');
|
|
4
|
+
|
|
5
|
+
// Single-letter vowels used to classify characters during syllabification.
const VOWELS = new Set('aeiou');

// Two-letter consonant spellings that must stay together as one unit.
const DIGRAPHS = [
  'ng',
  'ny',
  'sy',
  'kh',
  'gh',
];

// Vowel pairs that count as a single vowel unit when they end a word.
const DIPHTHONGS = [
  'ai',
  'au',
  'oi',
];
|
|
8
|
+
|
|
9
|
+
/**
 * Upper-case the first character of `word` and lower-case the remainder.
 * Falsy input (empty string, null, undefined) is handed back untouched.
 *
 * @param {string} word - Word to re-case.
 * @returns {string} The re-cased word, or the input itself when it is falsy.
 */
function titleCase(word) {
  if (word) {
    const head = word.charAt(0);
    const tail = word.substring(1);
    return `${head.toUpperCase()}${tail.toLowerCase()}`;
  }
  return word;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Lightweight Malay syllabifier. Used for tokens not in the dictionary so the
 * tts_ready output is consistent across the whole address.
 *
 * Only runs of ASCII letters are syllabified. Every other character (digits,
 * hyphens, apostrophes, ...) is copied through unchanged and is never counted
 * as a consonant, so a code such as "U1A" stays intact and "Bukit-Bintang"
 * becomes "Bu-kit-Bin-tang" rather than picking up a doubled hyphen.
 *
 * @param {string} word - A single token.
 * @returns {string} The token with "-" inserted between syllables.
 */
function syllabify(word) {
  // Splitting on a capturing group keeps the separators in the result:
  // even slots hold letter runs (possibly empty), odd slots hold the
  // non-letter text that sat between them.
  return word
    .split(/([^A-Za-z]+)/)
    .map((part, slot) => (slot % 2 === 0 ? syllabifyLetterRun(part) : part))
    .join('');
}

/**
 * Insert syllable breaks into a run made up solely of ASCII letters.
 *
 * @param {string} run - Letters only; original casing is preserved in the output.
 * @returns {string} The run with "-" between syllables, or the run unchanged
 *   when it is at most two letters long or contains fewer than two vowel units.
 */
function syllabifyLetterRun(run) {
  if (run.length <= 2) return run;
  const lower = run.toLowerCase();

  // Tokenise into atomic units, keeping digraphs (ng, ny, ...) together as a
  // single consonant.
  const units = [];
  for (let i = 0; i < run.length; ) {
    if (DIGRAPHS.includes(lower.slice(i, i + 2))) {
      units.push({ chars: run.slice(i, i + 2), type: 'consonant' });
      i += 2;
    } else {
      units.push({ chars: run[i], type: VOWELS.has(lower[i]) ? 'vowel' : 'consonant' });
      i += 1;
    }
  }

  // Collapse a word-final diphthong (ai, au, oi) into one vowel unit so that
  // names like "Sungai" come out as "Su-ngai", not "Su-nga-i".
  if (units.length >= 2) {
    const last = units[units.length - 1];
    const prev = units[units.length - 2];
    if (last.type === 'vowel' && prev.type === 'vowel') {
      const pair = (prev.chars + last.chars).toLowerCase();
      if (DIPHTHONGS.includes(pair)) {
        units.splice(units.length - 2, 2, { chars: prev.chars + last.chars, type: 'vowel' });
      }
    }
  }

  const vowelIdx = [];
  units.forEach((u, j) => { if (u.type === 'vowel') vowelIdx.push(j); });
  // Zero or one vowel unit means there is nothing to split.
  if (vowelIdx.length <= 1) return run;

  // Maximum-onset rule: a lone consonant between vowels starts the next
  // syllable (V-CV); two or more split one-to-the-left (VC-CV / VC-CCV).
  // Adjacent vowels are separated (V-V).
  const boundaries = [0];
  for (let k = 0; k < vowelIdx.length - 1; k++) {
    const v1 = vowelIdx[k];
    const v2 = vowelIdx[k + 1];
    const between = v2 - v1 - 1;
    if (between === 0) boundaries.push(v2);
    else if (between === 1) boundaries.push(v1 + 1);
    else boundaries.push(v1 + 2);
  }

  // Each boundary marks the first unit of a syllable; the last syllable runs
  // to the end of the unit list.
  const syllables = boundaries.map((start, b) => {
    const end = b + 1 < boundaries.length ? boundaries[b + 1] : units.length;
    return units.slice(start, end).map(u => u.chars).join('');
  });
  return syllables.join('-');
}
|
|
71
|
+
|
|
72
|
+
/**
 * Report whether `s` contains at least one ASCII capital letter and is left
 * unchanged by upper-casing (i.e. it has no lower-case characters).
 *
 * @param {string} s - Text to inspect.
 * @returns {boolean}
 */
function isAllCaps(s) {
  const hasCapital = /[A-Z]/.test(s);
  if (!hasCapital) {
    return false;
  }
  return s.toUpperCase() === s;
}
|
|
75
|
+
|
|
76
|
+
/**
 * True when the token is non-empty and consists only of whitespace.
 *
 * @param {string} t - Token to test.
 * @returns {boolean}
 */
function isWhitespace(t) {
  const whitespaceOnly = /^\s+$/;
  return whitespaceOnly.test(t);
}
|
|
77
|
+
/**
 * True when the token is non-empty and consists only of the separator
 * characters , . ; : ( ) /
 *
 * @param {string} t - Token to test.
 * @returns {boolean}
 */
function isPunctuation(t) {
  const punctuationOnly = /^[,.;:()\/]+$/;
  return punctuationOnly.test(t);
}
|
|
78
|
+
|
|
79
|
+
/**
 * True for a bare number or an alphanumeric code whose first character is a
 * digit (1, 12, 1A, 2B). Consulted by the numeric_suffix_is_code rule.
 *
 * @param {string} t - Token to test.
 * @returns {boolean}
 */
function isCodeToken(t) {
  const digitLedCode = /^\d[\dA-Za-z]*$/;
  return digitLedCode.test(t);
}
|
|
82
|
+
|
|
83
|
+
/**
 * Find the first token after `fromIndex` that is neither whitespace nor
 * punctuation.
 *
 * @param {string[]} tokens - Token list to scan.
 * @param {number} fromIndex - Position to start after (this slot is skipped).
 * @returns {string|null} The next content token, or null when none remains.
 */
function nextContentToken(tokens, fromIndex) {
  let cursor = fromIndex + 1;
  while (cursor < tokens.length) {
    const candidate = tokens[cursor];
    cursor++;
    const isSeparator = isWhitespace(candidate) || isPunctuation(candidate);
    if (!isSeparator) {
      return candidate;
    }
  }
  return null;
}
|
|
90
|
+
|
|
91
|
+
/**
 * Expand abbreviated address tokens and build a syllabified companion string.
 *
 * The input is split on whitespace and the punctuation , . ; : ( ) / with the
 * separators kept, so both output strings preserve the original spacing.
 * Tokens found in the dictionary (case-insensitively) are replaced by their
 * expansion unless one of the entry's disambiguation rules says otherwise:
 *   - numeric_prefix_is_weight: a number glued to the token ("5KG") or a bare
 *     number as the previous content token leaves the token unexpanded.
 *   - numeric_suffix_is_code: a following content token that starts with a
 *     digit ("KT 2B") leaves the token unexpanded.
 *   - default_to_malay: the token is expanded, but the guess is flagged.
 * Tokens not in the dictionary are title-cased when they are purely
 * alphabetic and all caps; everything else is kept verbatim.
 *
 * @param {string} input - Raw address text.
 * @returns {{normalised: string, tts_ready: string,
 *   expansions_applied: Array<{from: string, to: string}>,
 *   ambiguous_flags: Array<{token: string, chosen: string, alternative: string, reason: string}>}}
 * @throws {TypeError} When `input` is not a string.
 */
function normalise(input) {
  if (typeof input !== 'string') {
    throw new TypeError('input must be a string');
  }

  // The capturing group makes split() keep the separators as tokens.
  const tokens = input.split(/(\s+|[,.;:()\/])/).filter(t => t !== '');

  const normalisedParts = [];
  const ttsParts = [];
  const expansionsApplied = [];
  const ambiguousFlags = [];
  let previousContentToken = null;

  for (let idx = 0; idx < tokens.length; idx++) {
    const token = tokens[idx];
    if (isWhitespace(token) || isPunctuation(token)) {
      normalisedParts.push(token);
      ttsParts.push(token);
      continue;
    }

    // Detect a number glued to letters, e.g. "5KG", so we can both look up the
    // alpha part in the dictionary and apply the weight heuristic.
    const glued = token.match(/^(\d+)([A-Za-z]+)$/);
    const numericAttached = glued ? glued[1] : null;
    const alphaPart = glued ? glued[2] : token;
    const upper = alphaPart.toUpperCase();
    // Own-property check so a token can never resolve to something inherited
    // from Object.prototype.
    const entry = Object.hasOwn(dictionary, upper) ? dictionary[upper] : undefined;

    if (entry) {
      // Work out whether a rule tells us to leave the token unexpanded.
      // An entry carries at most one rule, so the checks are exclusive.
      let keepReason = null;
      if (entry.rule === 'numeric_prefix_is_weight') {
        // Weight heuristic: a numeric prefix (in-token or as the previous
        // token) means a unit like "5 KG of rice", not "Kampung".
        const numericContext = numericAttached !== null
          || (previousContentToken !== null && /^\d+$/.test(previousContentToken));
        if (numericContext) {
          keepReason = `numeric context suggests ${entry.ambiguous_with}; left unexpanded`;
        }
      } else if (entry.rule === 'numeric_suffix_is_code') {
        // Section-code heuristic: when the following token starts with a
        // digit (e.g. "KT 1", "KT 2B"), treat KT as a section prefix.
        const next = nextContentToken(tokens, idx);
        if (next && isCodeToken(next)) {
          keepReason = `following token "${next}" suggests ${entry.ambiguous_with}; left unexpanded`;
        }
      }

      if (keepReason !== null) {
        normalisedParts.push(token);
        ttsParts.push(token);
        ambiguousFlags.push({
          token,
          chosen: token,
          alternative: entry.expansion,
          reason: keepReason,
        });
        previousContentToken = token;
        continue;
      }

      const expansion = entry.expansion;
      const syllables = entry.syllables;
      // A glued number is separated from the expanded word by a space.
      const display = numericAttached !== null ? `${numericAttached} ${expansion}` : expansion;
      const tts = numericAttached !== null ? `${numericAttached} ${syllables}` : syllables;

      normalisedParts.push(display);
      ttsParts.push(tts);
      expansionsApplied.push({ from: token, to: expansion });

      // Only flag when we *defaulted* — i.e. picked one of multiple plausible
      // meanings without a deterministic rule to resolve it. KG resolved by
      // numeric_prefix_is_weight is not a guess; SG defaulting to Sungai is.
      if (entry.ambiguous_with && entry.rule === 'default_to_malay') {
        ambiguousFlags.push({
          token,
          chosen: expansion,
          alternative: entry.ambiguous_with,
          reason: `defaulted to ${expansion}; could also mean ${entry.ambiguous_with}`,
        });
      }
    } else {
      // Title-case only purely alphabetic all-caps tokens. Mixed alphanumeric
      // tokens (KT1, A12, 50450) are codes — preserve them verbatim. A
      // digit-glued token (12AB, 5KM) is a code too: its alpha part is always
      // purely alphabetic, so it must be excluded explicitly or it would be
      // rewritten to "12Ab" / "5Km".
      const isPlainWord = glued === null && /^[A-Za-z]+$/.test(alphaPart);
      const display = (isPlainWord && isAllCaps(alphaPart)) ? titleCase(alphaPart) : alphaPart;
      const tts = syllabify(display);
      normalisedParts.push(numericAttached !== null ? numericAttached + display : display);
      ttsParts.push(numericAttached !== null ? numericAttached + tts : tts);
    }

    previousContentToken = token;
  }

  return {
    normalised: normalisedParts.join(''),
    tts_ready: ttsParts.join(''),
    expansions_applied: expansionsApplied,
    ambiguous_flags: ambiguousFlags,
  };
}
|
|
201
|
+
|
|
202
|
+
// Public API of this module: `normalise` (address expansion) and `syllabify`
// (the syllable splitter it applies to tokens that are not in the dictionary).
module.exports = { normalise, syllabify };
|