bm-address-normaliser 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "name": "bm-address-normaliser",
3
+ "version": "1.0.0",
4
+ "description": "Expand abbreviated Malaysian addresses for display and text-to-speech.",
5
+ "main": "src/index.js",
6
+ "files": [
7
+ "src"
8
+ ],
9
+ "scripts": {
10
+ "test": "node --test"
11
+ },
12
+ "keywords": [
13
+ "malaysia",
14
+ "bahasa-malaysia",
15
+ "address",
16
+ "nlp",
17
+ "tts",
18
+ "normaliser"
19
+ ],
20
+ "author": "Ahmad Hozir <ahozir@gmail.com>",
21
+ "license": "MIT",
22
+ "repository": {
23
+ "type": "git",
24
+ "url": "https://github.com/ahozir-ux/bm-address-normaliser.git"
25
+ },
26
+ "engines": {
27
+ "node": ">=18.0.0"
28
+ }
29
+ }
package/src/dictionary.json ADDED
@@ -0,0 +1,60 @@
1
+ {
2
+ "JLN": { "expansion": "Jalan", "syllables": "Ja-lan" },
3
+ "TMN": { "expansion": "Taman", "syllables": "Ta-man" },
4
+ "BKT": { "expansion": "Bukit", "syllables": "Bu-kit" },
5
+ "LRG": { "expansion": "Lorong", "syllables": "Lo-rong" },
6
+ "BDR": { "expansion": "Bandar", "syllables": "Ban-dar" },
7
+ "BT": { "expansion": "Batu", "syllables": "Ba-tu" },
8
+ "SEK": { "expansion": "Seksyen", "syllables": "Sek-syen" },
9
+ "TKT": { "expansion": "Tingkat", "syllables": "Ting-kat" },
10
+ "BLK": { "expansion": "Blok", "syllables": "Blok" },
11
+ "APT": { "expansion": "Apartmen", "syllables": "A-part-men" },
12
+ "ULU": { "expansion": "Ulu", "syllables": "U-lu" },
13
+
14
+ "KL": { "expansion": "Kuala Lumpur", "syllables": "Kua-la Lum-pur" },
15
+ "PJ": { "expansion": "Petaling Jaya", "syllables": "Pe-ta-ling Ja-ya" },
16
+ "JB": { "expansion": "Johor Bahru", "syllables": "Jo-hor Bah-ru" },
17
+ "KB": { "expansion": "Kota Bharu", "syllables": "Ko-ta Bha-ru" },
18
+ "KK": { "expansion": "Kota Kinabalu", "syllables": "Ko-ta Ki-na-ba-lu" },
19
+ "PNG": { "expansion": "Pulau Pinang", "syllables": "Pu-lau Pi-nang" },
20
+ "KCH": { "expansion": "Kuching", "syllables": "Ku-ching" },
21
+
22
+ "KG": {
23
+ "expansion": "Kampung",
24
+ "syllables": "Kam-pung",
25
+ "ambiguous_with": "kilogram",
26
+ "rule": "numeric_prefix_is_weight"
27
+ },
28
+ "KPG": {
29
+ "expansion": "Kampung",
30
+ "syllables": "Kam-pung",
31
+ "ambiguous_with": "kilogram",
32
+ "rule": "numeric_prefix_is_weight"
33
+ },
34
+ "SG": {
35
+ "expansion": "Sungai",
36
+ "syllables": "Su-ngai",
37
+ "ambiguous_with": "Singapore",
38
+ "rule": "default_to_malay"
39
+ },
40
+ "KT": {
41
+ "expansion": "Kuala Terengganu",
42
+ "syllables": "Kua-la Te-reng-ga-nu",
43
+ "ambiguous_with": "section code prefix",
44
+ "rule": "numeric_suffix_is_code"
45
+ },
46
+
47
+ "SGR": { "expansion": "Selangor", "syllables": "Se-la-ngor" },
48
+ "JHR": { "expansion": "Johor", "syllables": "Jo-hor" },
49
+ "PHG": { "expansion": "Pahang", "syllables": "Pa-hang" },
50
+ "PRK": { "expansion": "Perak", "syllables": "Pe-rak" },
51
+ "KDH": { "expansion": "Kedah", "syllables": "Ke-dah" },
52
+ "KTN": { "expansion": "Kelantan", "syllables": "Ke-lan-tan" },
53
+ "MLK": { "expansion": "Melaka", "syllables": "Me-la-ka" },
54
+ "NSN": { "expansion": "Negeri Sembilan", "syllables": "Ne-ge-ri Sem-bi-lan" },
55
+ "TRG": { "expansion": "Terengganu", "syllables": "Te-reng-ga-nu" },
56
+ "SBH": { "expansion": "Sabah", "syllables": "Sa-bah" },
57
+ "SWK": { "expansion": "Sarawak", "syllables": "Sa-ra-wak" },
58
+ "LBN": { "expansion": "Labuan", "syllables": "La-bu-an" },
59
+ "PJY": { "expansion": "Putrajaya", "syllables": "Pu-tra-ja-ya" }
60
+ }
package/src/index.js ADDED
@@ -0,0 +1,6 @@
1
+ 'use strict';
2
+
3
+ const { normalise, syllabify } = require('./normalise.js');
4
+ const dictionary = require('./dictionary.json');
5
+
6
+ module.exports = { normalise, syllabify, dictionary };
package/src/normalise.js ADDED
@@ -0,0 +1,202 @@
1
+ 'use strict';
2
+
3
+ const dictionary = require('./dictionary.json');
4
+
5
// Lower-case vowel letters; syllabify() classifies every other single
// character as a consonant.
const VOWELS = new Set([
  'a',
  'e',
  'i',
  'o',
  'u',
]);

// Two-letter sequences that syllabify() keeps together as one consonant unit.
const DIGRAPHS = [
  'ng',
  'ny',
  'sy',
  'kh',
  'gh',
];

// Vowel pairs that syllabify() merges into a single vowel unit when they are
// the last two units of a word.
const DIPHTHONGS = [
  'ai',
  'au',
  'oi',
];
8
+
9
/**
 * Upper-case the first character of a word and lower-case the rest.
 *
 * @param {string} word - Word to re-case.
 * @returns {string} The re-cased word; falsy input (e.g. '') is returned as-is.
 */
function titleCase(word) {
  if (!word) {
    return word;
  }
  const head = word.charAt(0).toUpperCase();
  const tail = word.slice(1).toLowerCase();
  return `${head}${tail}`;
}
13
+
14
/**
 * Lightweight Malay syllabifier. Used for tokens not in the dictionary so the
 * tts_ready output is consistent across the whole address.
 *
 * Only runs of letters are syllabified. Digits, hyphens, apostrophes and any
 * other non-letter characters are copied through untouched, so reduplicated
 * names ("Rama-Rama" -> "Ra-ma-Ra-ma") and unit codes ("A-3A-10") never pick
 * up a doubled or misplaced hyphen.
 *
 * @param {string} word - A single token.
 * @returns {string} The token with "-" inserted at syllable boundaries.
 * @throws {TypeError} If `word` is not a string.
 */
function syllabify(word) {
  if (typeof word !== 'string') {
    throw new TypeError('word must be a string');
  }

  // split() with a capture group alternates letter runs (even indices) with
  // the separator runs that were matched (odd indices).
  return word
    .split(/([^\p{L}\p{M}]+)/u)
    .map((part, i) => (i % 2 === 0 ? syllabifyLetterRun(part) : part))
    .join('');
}

// Syllabify one run of letters; runs of one or two characters are returned
// unchanged, as are runs with fewer than two vowel units.
function syllabifyLetterRun(run) {
  if (run.length <= 2) return run;

  // Tokenise the run into atomic units, keeping digraphs (ng, ny, ...) together.
  // Lower-casing is done per slice so indices always refer to `run` itself.
  const units = [];
  for (let i = 0; i < run.length; ) {
    const pair = run.slice(i, i + 2);
    if (DIGRAPHS.includes(pair.toLowerCase())) {
      units.push({ chars: pair, type: 'consonant' });
      i += 2;
    } else {
      const ch = run[i];
      units.push({ chars: ch, type: VOWELS.has(ch.toLowerCase()) ? 'vowel' : 'consonant' });
      i += 1;
    }
  }

  // Collapse a run-final diphthong (ai, au, oi) into a single vowel unit so
  // names like "Sungai" syllabify as "Su-ngai", not "Su-nga-i".
  if (units.length >= 2) {
    const last = units[units.length - 1];
    const prev = units[units.length - 2];
    const pair = prev.chars + last.chars;
    if (
      last.type === 'vowel'
      && prev.type === 'vowel'
      && DIPHTHONGS.includes(pair.toLowerCase())
    ) {
      units.splice(units.length - 2, 2, { chars: pair, type: 'vowel' });
    }
  }

  const vowelIdx = [];
  units.forEach((u, j) => {
    if (u.type === 'vowel') vowelIdx.push(j);
  });
  if (vowelIdx.length <= 1) return run;

  // Between two vowels: no consonant -> split before the second vowel (V-V);
  // one consonant -> it starts the next syllable (V-CV); two or more -> exactly
  // one stays on the left (VC-CV / VC-CCV).
  const boundaries = [0];
  for (let k = 0; k < vowelIdx.length - 1; k++) {
    const v1 = vowelIdx[k];
    const v2 = vowelIdx[k + 1];
    const between = v2 - v1 - 1;
    if (between === 0) boundaries.push(v2);
    else if (between === 1) boundaries.push(v1 + 1);
    else boundaries.push(v1 + 2);
  }

  const syllables = boundaries.map((start, b) => {
    const end = b + 1 < boundaries.length ? boundaries[b + 1] : units.length;
    return units.slice(start, end).map((u) => u.chars).join('');
  });
  return syllables.join('-');
}
71
+
72
/**
 * Report whether a string contains at least one ASCII capital letter and is
 * unchanged by upper-casing.
 *
 * @param {string} s - Text to inspect.
 * @returns {boolean} True when `s` is written entirely in capitals.
 */
function isAllCaps(s) {
  const hasAsciiCapital = /[A-Z]/.test(s);
  if (!hasAsciiCapital) {
    return false;
  }
  return s.toUpperCase() === s;
}
75
+
76
/**
 * Report whether a token consists solely of one or more whitespace characters.
 *
 * @param {string} t - Token to test.
 * @returns {boolean} True for whitespace-only tokens; false for ''.
 */
function isWhitespace(t) {
  const whitespaceOnly = /^\s+$/;
  return whitespaceOnly.test(t);
}
77
/**
 * Report whether a token consists solely of the separator characters
 * , . ; : ( ) and /.
 *
 * @param {string} t - Token to test.
 * @returns {boolean} True for punctuation-only tokens; false for ''.
 */
function isPunctuation(t) {
  const punctuationOnly = /^[,.;:()\/]+$/;
  return punctuationOnly.test(t);
}
78
+
79
/**
 * Report whether a token is a bare number or an alphanumeric code that starts
 * with a digit (1, 12, 1A, 2B). Used by the numeric_suffix_is_code rule.
 *
 * @param {string} t - Token to test.
 * @returns {boolean} True when `t` is a digit followed only by ASCII digits/letters.
 */
function isCodeToken(t) {
  const digitLedCode = /^\d[\dA-Za-z]*$/;
  return digitLedCode.test(t);
}
82
+
83
/**
 * Find the first token after `fromIndex` that is neither whitespace nor
 * punctuation.
 *
 * @param {string[]} tokens - Token list produced by the splitter.
 * @param {number} fromIndex - Index of the current token; the search starts after it.
 * @returns {string|null} The next content token, or null when none remains.
 */
function nextContentToken(tokens, fromIndex) {
  let cursor = fromIndex + 1;
  while (cursor < tokens.length) {
    const candidate = tokens[cursor];
    const isSeparator = isWhitespace(candidate) || isPunctuation(candidate);
    if (!isSeparator) {
      return candidate;
    }
    cursor += 1;
  }
  return null;
}
90
+
91
/**
 * Expand abbreviations in an address string and build a syllabified variant
 * for text-to-speech.
 *
 * The input is split on whitespace and the punctuation , . ; : ( ) /, and the
 * separators are kept so the original spacing is reproduced in the output.
 * Each remaining token is handled as follows:
 *   - dictionary abbreviations (optionally with a number glued in front, e.g.
 *     "5JLN") are expanded, subject to the entry's disambiguation rule;
 *   - tokens that contain a digit and are not a dictionary abbreviation are
 *     treated as codes and copied verbatim to both outputs;
 *   - other tokens are title-cased when they are purely alphabetic all-caps,
 *     and passed through syllabify() for the TTS output.
 *
 * @param {string} input - Raw address text.
 * @returns {{
 *   normalised: string,
 *   tts_ready: string,
 *   expansions_applied: Array<{from: string, to: string}>,
 *   ambiguous_flags: Array<{token: string, chosen: string, alternative: string, reason: string}>
 * }} Display text, TTS text, the expansions performed, and any ambiguity notes.
 * @throws {TypeError} If `input` is not a string.
 */
function normalise(input) {
  if (typeof input !== 'string') {
    throw new TypeError('input must be a string');
  }

  // Split on whitespace and a small set of address punctuation, keeping the
  // separators so we can reassemble the original spacing.
  const tokens = input.split(/(\s+|[,.;:()\/])/).filter((t) => t !== '');

  const normalisedParts = [];
  const ttsParts = [];
  const expansionsApplied = [];
  const ambiguousFlags = [];
  // NOTE(review): this skips punctuation as well as whitespace, so
  // "12, KG BARU" is treated as a weight just like "12 KG" — confirm intended.
  let previousContentToken = null;

  for (let idx = 0; idx < tokens.length; idx++) {
    const token = tokens[idx];
    if (isWhitespace(token) || isPunctuation(token)) {
      normalisedParts.push(token);
      ttsParts.push(token);
      continue;
    }

    // Detect a number glued to letters, e.g. "5KG", so we can both look up the
    // alpha part in the dictionary and apply the weight heuristic.
    const glued = token.match(/^(\d+)([A-Za-z]+)$/);
    const numericAttached = glued ? glued[1] : null;
    const alphaPart = glued ? glued[2] : token;
    const upper = alphaPart.toUpperCase();
    // Own-property check so an inherited Object.prototype key can never be
    // mistaken for a dictionary entry.
    const entry = Object.hasOwn(dictionary, upper) ? dictionary[upper] : undefined;

    if (entry) {
      // Weight heuristic: a numeric prefix (in-token or as the previous token)
      // means we're looking at a unit like "5 KG of rice", not "Kampung".
      if (entry.rule === 'numeric_prefix_is_weight') {
        const numericContext = numericAttached !== null
          || (previousContentToken !== null && /^\d+$/.test(previousContentToken));
        if (numericContext) {
          normalisedParts.push(token);
          ttsParts.push(token);
          ambiguousFlags.push({
            token,
            chosen: token,
            alternative: entry.expansion,
            reason: `numeric context suggests ${entry.ambiguous_with}; left unexpanded`,
          });
          previousContentToken = token;
          continue;
        }
      }

      // Section-code heuristic: when the following token starts with a digit
      // (e.g. "KT 1", "KT 2B"), treat KT as a section prefix and leave it
      // alone. With no numeric follower, fall through to the expansion.
      if (entry.rule === 'numeric_suffix_is_code') {
        const next = nextContentToken(tokens, idx);
        if (next && isCodeToken(next)) {
          normalisedParts.push(token);
          ttsParts.push(token);
          ambiguousFlags.push({
            token,
            chosen: token,
            alternative: entry.expansion,
            reason: `following token "${next}" suggests ${entry.ambiguous_with}; left unexpanded`,
          });
          previousContentToken = token;
          continue;
        }
      }

      const expansion = entry.expansion;
      const syllables = entry.syllables;
      const display = numericAttached ? `${numericAttached} ${expansion}` : expansion;
      const tts = numericAttached ? `${numericAttached} ${syllables}` : syllables;

      normalisedParts.push(display);
      ttsParts.push(tts);
      expansionsApplied.push({ from: token, to: expansion });

      // Only flag when we *defaulted* — i.e. picked one of multiple plausible
      // meanings without a deterministic rule to resolve it. KG resolved by
      // numeric_prefix_is_weight is not a guess; SG defaulting to Sungai is.
      if (entry.ambiguous_with && entry.rule === 'default_to_malay') {
        ambiguousFlags.push({
          token,
          chosen: expansion,
          alternative: entry.ambiguous_with,
          reason: `defaulted to ${expansion}; could also mean ${entry.ambiguous_with}`,
        });
      }
    } else if (/\d/.test(token)) {
      // Any non-dictionary token containing a digit (KT1, A12, 50450, 2ND,
      // A-3A-10) is a code: preserve it verbatim in BOTH outputs. Re-casing or
      // syllabifying it would corrupt it ("2ND" -> "2Nd", "PJU1A" -> "PJU-1A").
      normalisedParts.push(token);
      ttsParts.push(token);
    } else {
      // No digits here, so the token cannot be "glued" and alphaPart === token.
      // Title-case only purely alphabetic all-caps tokens.
      const pureAlpha = /^[A-Za-z]+$/.test(token);
      const display = (pureAlpha && isAllCaps(token)) ? titleCase(token) : token;
      normalisedParts.push(display);
      ttsParts.push(syllabify(display));
    }

    previousContentToken = token;
  }

  return {
    normalised: normalisedParts.join(''),
    tts_ready: ttsParts.join(''),
    expansions_applied: expansionsApplied,
    ambiguous_flags: ambiguousFlags,
  };
}
201
+
202
+ module.exports = { normalise, syllabify };