tamil-romanizer 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +63 -0
- package/data/exceptions.json +9 -0
- package/index.js +1 -0
- package/package.json +29 -0
- package/src/contextAnalyzer.js +85 -0
- package/src/decomposer.js +50 -0
- package/src/exceptionTrie.js +56 -0
- package/src/romanizer.js +90 -0
- package/src/sanitizer.js +26 -0
- package/src/schemeResolver.js +95 -0
- package/src/schemes/alaLc.js +15 -0
- package/src/schemes/iso15919.js +16 -0
- package/src/schemes/practical.js +34 -0
- package/src/specialTokens.js +51 -0
- package/src/tokenizer.js +76 -0
package/README.md
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# Tamil Romanizer (v1.0)
|
|
2
|
+
|
|
3
|
+
A robust, context-aware rule-based Tamil-to-English romanization library.
|
|
4
|
+
|
|
5
|
+
Vastly outperforming naive character-replacement scripts, this library implements a **6-Layer pipeline** powered by grapheme cluster tokenization (`Intl.Segmenter`) and phonological context analysis to yield true, phonetic English mappings from dense Tamil text.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Grapheme Accurate:** Handles zero-width joiners, complex modifier stacks, and canonical normalization natively.
|
|
10
|
+
- **Context-Aware Phonology:** Detects word-initial constraints, intervocalic softening, post-nasal transformations, and geminate cluster conditions to output dynamically accurate English syntax (e.g. `ப` maps to `p` or `b` dynamically depending on cross-word sandhi context).
|
|
11
|
+
- **Multiple Mapping Schemes:** Natively supports intelligent `practical` syntax (Tanglish/casual phonetic usage) and strict `iso15919` formalized transliteration.
|
|
12
|
+
- **Exception Trie Routing:** Intercepts proper nouns and anglicized loan words prior to phonological transcription.
|
|
13
|
+
- **Foreign-Script Safe:** Can safely ingest paragraphs containing English, Japanese, or other Unicode blocks, surgically romanizing only the Tamil tokens while passing non-Tamil text through untouched.
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
npm install tamil-romanizer
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
|
|
23
|
+
```javascript
|
|
24
|
+
import { romanize } from 'tamil-romanizer';
|
|
25
|
+
|
|
26
|
+
// Basic Transliteration
|
|
27
|
+
console.log(romanize("தமிழ்")); // "thamizh"
|
|
28
|
+
|
|
29
|
+
// Practical Phonics (Contextual Mapping)
|
|
30
|
+
console.log(romanize("பம்பரம்")); // "pambaram" (Initial P, Post-Nasal B)
|
|
31
|
+
console.log(romanize("சிங்கம்")); // "singam"
|
|
32
|
+
|
|
33
|
+
// Capitalization Support
|
|
34
|
+
console.log(romanize("சென்னை பயணம்", { capitalize: 'sentence' })); // "Chennai payanam"
|
|
35
|
+
|
|
36
|
+
// Strict ISO 15919 Syntax
|
|
37
|
+
console.log(romanize("தமிழ்நாடு", { scheme: 'iso15919' })); // "tamiḻnāṭu"
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## API Options
|
|
41
|
+
You can adjust parsing behavior globally via a config block on `romanize(text, options)`.
|
|
42
|
+
|
|
43
|
+
* **scheme**: Target rules (`'practical'` default, `'iso15919'`, or `'ala-lc'`)
|
|
44
|
+
* **exceptions**: Boolean enabling exception lookups (defaults `true`)
|
|
45
|
+
* **capitalize**: Output casing rule (`'none'`, `'words'`, `'sentence'`). *Note: `none` enforces strict lowercase even for proper noun dictionary inputs.*
|
|
46
|
+
|
|
47
|
+
## Architecture
|
|
48
|
+
|
|
49
|
+
1. **Sanitizer:** NFC normalization & format-character stripping.
|
|
50
|
+
2. **Cluster Tokenizer:** Uses `Intl.Segmenter` to split graphemes accurately.
|
|
51
|
+
3. **Decomposer:** Maps bases and vowel modifiers distinctively.
|
|
52
|
+
4. **Context Analyzer:** Positional tagging (Word Initial, Intervocalic, Geminate, Post-Nasal).
|
|
53
|
+
5. **Scheme Resolver:** Base lookup to targeted transliteration schema (`iso15919`, `practical`, `ala-lc`).
|
|
54
|
+
6. **Special Token Handler:** Cross-cluster constraints (Aytham lookaheads, Grantha sequence transformations).
|
|
55
|
+
7. **Exception Trie:** Fast dictionary overrides.
|
|
56
|
+
|
|
57
|
+
## Testing & Reliability
|
|
58
|
+
This library was built with mathematical rigour, achieving > 98% test coverage via `vitest`.
|
|
59
|
+
* **ISO 15919 Benchmark:** Achieves 100% exact-match compliance (0% Character Error Rate) against the official specification.
|
|
60
|
+
* **Stress Testing:** A built-in CLI is available to test arbitrary text locally:
|
|
61
|
+
1. Paste text into `test/stress/input.txt`
|
|
62
|
+
2. Run `node test/stress/evaluate.js`
|
|
63
|
+
|
package/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Public package entry point: re-exports the romanize() pipeline facade.
export { romanize } from './src/romanizer.js';
|
package/package.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "tamil-romanizer",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Tamil Romanization Engine v1.0",
|
|
5
|
+
"main": "index.js",
|
|
6
|
+
"type": "module",
|
|
7
|
+
"scripts": {
|
|
8
|
+
"test": "vitest run"
|
|
9
|
+
},
|
|
10
|
+
"keywords": [
|
|
11
|
+
"tamil",
|
|
12
|
+
"romanizer",
|
|
13
|
+
"transliteration"
|
|
14
|
+
],
|
|
15
|
+
"author": "Harold Alan",
|
|
16
|
+
"license": "ISC",
|
|
17
|
+
"files": [
|
|
18
|
+
"src",
|
|
19
|
+
"data",
|
|
20
|
+
"index.js"
|
|
21
|
+
],
|
|
22
|
+
"exports": {
|
|
23
|
+
".": "./index.js"
|
|
24
|
+
},
|
|
25
|
+
"devDependencies": {
|
|
26
|
+
"@vitest/coverage-v8": "^4.0.18",
|
|
27
|
+
"vitest": "^4.0.18"
|
|
28
|
+
}
|
|
29
|
+
}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import { tokenTypes } from './tokenizer.js';
|
|
2
|
+
import { modifierTypes } from './decomposer.js';
|
|
3
|
+
|
|
4
|
+
// Positional/phonological tags assigned by analyzeContext() and consumed by
// the scheme resolver to pick context-sensitive (allophonic) consonant mappings.
export const contextTags = {
  WORD_INITIAL: 'WORD_INITIAL', // first consonant cluster of a word
  GEMINATE: 'GEMINATE',         // either half of a doubled consonant (e.g. க்க)
  POST_NASAL: 'POST_NASAL',     // immediately follows a virama'd nasal
  INTERVOCALIC: 'INTERVOCALIC', // flanked by vowel-carrying clusters
  WORD_FINAL: 'WORD_FINAL',     // last cluster of a word
  DEFAULT: 'DEFAULT'            // no special positional context
};
|
|
12
|
+
|
|
13
|
+
// Tamil nasal consonant bases; a preceding virama'd nasal triggers POST_NASAL voicing.
const nasals = ['ங', 'ன', 'ண', 'ந', 'ம', 'ஞ'];
|
|
14
|
+
|
|
15
|
+
/**
 * True when a token contributes a vowel sound for intervocalic flanking:
 * pure vowels, plus consonants carrying a vowel sign or the inherent 'a'.
 */
function carriesVowel(token) {
  if (!token) return false;
  const vowelBearers = [
    tokenTypes.VOWEL,
    tokenTypes.CONSONANT_VOWEL_SIGN,
    tokenTypes.CONSONANT_BARE
  ];
  return vowelBearers.includes(token.type);
}
|
|
27
|
+
|
|
28
|
+
/**
 * Assigns a phonological context tag to each decomposed token.
 *
 * Only consonant clusters receive meaningful tags (they drive allophone
 * selection in the scheme resolver); pure vowels and OTHER tokens keep
 * DEFAULT. Priority when not word-initial:
 * GEMINATE > POST_NASAL > INTERVOCALIC > WORD_FINAL.
 *
 * Fix: removed the `wordInitialIndex` local, which was assigned but never read.
 *
 * @param {Array<Object>} tokens - Array of decomposed tokens.
 * @returns {Array<Object>} Tokens with a contextTag property added.
 */
export function analyzeContext(tokens) {
  if (!Array.isArray(tokens)) return [];

  return tokens.map((token, index) => {
    let tag = contextTags.DEFAULT;

    if ([tokenTypes.CONSONANT_BARE, tokenTypes.CONSONANT_VIRAMA, tokenTypes.CONSONANT_VOWEL_SIGN].includes(token.type)) {
      const prevToken = index > 0 ? tokens[index - 1] : null;
      const nextToken = index < tokens.length - 1 ? tokens[index + 1] : null;

      // Word boundaries: the very first token, or a token adjacent to an
      // OTHER token (space / punctuation / non-Tamil text).
      const isWordInitial = index === 0 || (prevToken && prevToken.type === tokenTypes.OTHER);
      const isWordFinal = index === tokens.length - 1 || (nextToken && nextToken.type === tokenTypes.OTHER);

      if (isWordInitial) {
        tag = contextTags.WORD_INITIAL;
      } else if (token.modifierType === modifierTypes.VIRAMA && nextToken && nextToken.base === token.base) {
        // GEMINATE, first half (the virama'd consonant).
        tag = contextTags.GEMINATE;
      } else if (prevToken && prevToken.modifierType === modifierTypes.VIRAMA && prevToken.base === token.base) {
        // GEMINATE, second half (the vowel-carrying consonant).
        tag = contextTags.GEMINATE;
      } else if (prevToken && prevToken.modifierType === modifierTypes.VIRAMA && nasals.includes(prevToken.base)) {
        // POST_NASAL: preceded by ங், ன், ண், ந், ம் or ஞ்.
        tag = contextTags.POST_NASAL;
      } else if (carriesVowel(prevToken) && carriesVowel(token)) {
        // INTERVOCALIC: both the previous cluster and this one hold a vowel.
        tag = contextTags.INTERVOCALIC;
      } else if (isWordFinal) {
        tag = contextTags.WORD_FINAL;
      }
    }

    return { ...token, contextTag: tag };
  });
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { tokenTypes } from './tokenizer.js';
|
|
2
|
+
|
|
3
|
+
// Classification of the combining mark attached to a consonant cluster.
export const modifierTypes = {
  VOWEL_SIGN: 'vowel_sign', // dependent vowel sign (ா, ி, ...)
  VIRAMA: 'virama',         // puḷḷi: suppresses the inherent 'a'
  NULL: 'null', // Inherent 'a' or pure vowel
  NONE: 'none' // For OTHER types
};
|
|
9
|
+
|
|
10
|
+
/**
 * Splits each grapheme-cluster token into its base code unit and any trailing
 * modifier (vowel sign or virama), tagging the modifier kind for later layers.
 *
 * @param {Array<{text: string, type: string}>} tokens
 * @returns {Array<{text: string, type: string, base: string, modifier: string, modifierType: string}>}
 */
export function decompose(tokens) {
  if (!Array.isArray(tokens)) return [];

  return tokens.map((token) => {
    const result = {
      ...token,
      base: token.text,
      modifier: '',
      modifierType: modifierTypes.NONE
    };

    switch (token.type) {
      case tokenTypes.VOWEL:
      case tokenTypes.CONSONANT_BARE:
        // Single code point: pure vowel, or consonant with inherent 'a'.
        result.base = token.text[0];
        result.modifierType = modifierTypes.NULL;
        break;
      case tokenTypes.CONSONANT_VIRAMA:
      case tokenTypes.CONSONANT_VOWEL_SIGN:
        // First code unit is the consonant base; the remainder of the
        // grapheme cluster is the combining mark(s).
        result.base = token.text[0];
        result.modifier = token.text.slice(1);
        result.modifierType = token.type === tokenTypes.CONSONANT_VIRAMA
          ? modifierTypes.VIRAMA
          : modifierTypes.VOWEL_SIGN;
        break;
      default:
        // OTHER tokens pass through verbatim with no modifier.
        break;
    }

    return result;
  });
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import exceptionsData from '../data/exceptions.json' with { type: 'json' };
|
|
2
|
+
|
|
3
|
+
// A single node of the exception trie, keyed by Unicode code point.
class TrieNode {
  constructor() {
    this.children = new Map(); // code point -> TrieNode
    this.isWordEnd = false;    // true when a dictionary word terminates here
    this.override = null;      // romanized override stored at the word end
  }
}
|
|
10
|
+
|
|
11
|
+
/**
 * Prefix trie of whole-word romanization overrides (proper nouns,
 * anglicized loan words). Consulted before the phonological pipeline.
 *
 * Generalized: the dictionary is now injectable. With no argument the
 * behavior is unchanged (bundled exceptions.json is used).
 */
export class ExceptionTrie {
  /**
   * @param {Object<string, string>} [entries=exceptionsData] - Map of Tamil
   *   word -> romanized override. Defaults to the bundled dictionary.
   */
  constructor(entries = exceptionsData) {
    this.root = new TrieNode();
    this.buildTrie(entries);
  }

  /**
   * Inserts every entry into the trie, one Unicode code point per level.
   * @param {Object<string, string>} [entries=exceptionsData]
   */
  buildTrie(entries = exceptionsData) {
    for (const [tamilWord, overrideStr] of Object.entries(entries)) {
      let current = this.root;
      for (const char of tamilWord) {
        if (!current.children.has(char)) {
          current.children.set(char, new TrieNode());
        }
        current = current.children.get(char);
      }
      current.isWordEnd = true;
      current.override = overrideStr;
    }
  }

  /**
   * Attempts to intercept an entire Tamil word via the dictionary.
   * Runs BEFORE the state machine.
   *
   * @param {string} tamilWord - A single contiguous Tamil word.
   * @returns {string|null} - The hardcoded romanized word, or null on a miss.
   */
  lookup(tamilWord) {
    let current = this.root;
    for (const char of tamilWord) {
      const child = current.children.get(char);
      if (!child) {
        return null; // prefix not present: immediate miss
      }
      current = child;
    }
    return current.isWordEnd ? current.override : null;
  }
}
|
|
54
|
+
|
|
55
|
+
// Shared singleton instance consumed by the romanizer pipeline.
export const exceptionDictionary = new ExceptionTrie();
|
package/src/romanizer.js
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import { sanitize } from './sanitizer.js';
|
|
2
|
+
import { tokenize } from './tokenizer.js';
|
|
3
|
+
import { decompose } from './decomposer.js';
|
|
4
|
+
import { analyzeContext } from './contextAnalyzer.js';
|
|
5
|
+
import { resolveScheme } from './schemeResolver.js';
|
|
6
|
+
import { handleSpecialTokens } from './specialTokens.js';
|
|
7
|
+
import { exceptionDictionary } from './exceptionTrie.js';
|
|
8
|
+
|
|
9
|
+
/**
 * Applies the requested casing strategy to an assembled romanized string.
 *
 * @param {string} text - The romanized text.
 * @param {string} format - 'words' | 'sentence' | anything else (treated as 'none').
 * @returns {string} The re-cased text ('' for empty/falsy input).
 */
function applyCapitalization(text, format) {
  if (!text) return '';
  switch (format) {
    case 'words':
      return text
        .split(' ')
        .map((word) => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase())
        .join(' ');
    case 'sentence':
      return text.charAt(0).toUpperCase() + text.slice(1).toLowerCase();
    default:
      // 'none' (and any unknown value) coerces to all-lowercase.
      return text.toLowerCase();
  }
}
|
|
23
|
+
|
|
24
|
+
/**
 * The public Tamil Romanizer API.
 *
 * Pipeline: sanitize -> (per word) exception lookup -> tokenize -> decompose
 * -> context analysis -> scheme resolution -> special-token handling -> casing.
 *
 * Fixes: removed the `isException` flag that was stored per word but never
 * read, and corrected the misleading comment around casing — 'none' has
 * always lowercased the whole output, including dictionary words, matching
 * the README's documented contract.
 *
 * @param {string} text - The raw Tamil string to transliterate.
 * @param {Object} [options={}] - Configuration options.
 * @param {string} [options.scheme='practical'] - 'practical', 'iso15919', 'ala-lc'.
 * @param {boolean} [options.exceptions=true] - Whether to use the Exception Trie for known words.
 * @param {Object} [options.table=null] - Custom map to override specific consonant derivations.
 * @param {string} [options.capitalize='none'] - Capitalization strategy ('none', 'sentence', 'words').
 * @returns {string} The fully romanized text ('' for non-string input).
 */
export function romanize(text, options = {}) {
  const {
    scheme = 'practical',
    exceptions = true,
    table = null,
    capitalize = 'none'
  } = options;

  if (typeof text !== 'string') return '';

  // 1. Sanitize (NFC normalization, ZWJ/ZWNJ stripping, numeral conversion).
  const cleanText = sanitize(text);
  if (!cleanText) return '';

  const pieces = [];
  // Split on whitespace with a capturing group so the separators are kept
  // and can be re-joined verbatim; this enables whole-word trie lookups.
  for (const word of cleanText.split(/(\s+)/)) {
    // Whitespace separators pass through untouched.
    if (!word.trim()) {
      pieces.push(word);
      continue;
    }

    // 2. Exception Trie intercept: known words bypass the pipeline entirely.
    if (exceptions) {
      const hardMatch = exceptionDictionary.lookup(word);
      if (hardMatch) {
        pieces.push(hardMatch);
        continue;
      }
    }

    // 3-6. Core phonological pipeline.
    const tokens = tokenize(word);
    const decomposed = decompose(tokens);
    const analyzed = analyzeContext(decomposed);
    const resolved = resolveScheme(analyzed, scheme, table);
    pieces.push(handleSpecialTokens(resolved, scheme));
  }

  const resultString = pieces.join('');

  // 'none' enforces strict lowercase across the entire output, including
  // exception-dictionary words (documented API contract).
  if (capitalize === 'none') {
    return resultString.toLowerCase();
  }
  // 'sentence' / 'words': user explicitly requested global casing rules.
  return applyCapitalization(resultString, capitalize);
}
|
package/src/sanitizer.js
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
 * Sanitizes a raw Tamil string for tokenization.
 *
 * Steps, in order: strip zero-width joiners, canonicalize the ஶ்ரீ variant of
 * "Sri" to ஸ்ரீ, convert Tamil digits to ASCII digits, and NFC-normalize so
 * the tokenizer always sees composed vowel signs.
 *
 * @param {string} text - The raw Tamil text.
 * @returns {string} The canonicalized and normalized Tamil text ('' for non-strings).
 */
export function sanitize(text) {
  if (typeof text !== 'string') return '';

  // 1. Remove ZWNJ (U+200C) and ZWJ (U+200D).
  const withoutJoiners = text.replace(/[\u200C\u200D]/g, '');

  // 2. Normalize the variant ஶ்ரீ (U+0BB6) spelling to canonical ஸ்ரீ (U+0BB8).
  const canonicalSri = withoutJoiners.replace(
    /\u0BB6\u0BCD\u0BB0\u0BC0/g,
    '\u0BB8\u0BCD\u0BB0\u0BC0'
  );

  // 3. Map Tamil numerals ௦ (U+0BE6) .. ௯ (U+0BEF) onto '0'..'9'.
  const asciiDigits = canonicalSri.replace(
    /[\u0BE6-\u0BEF]/g,
    (digit) => String.fromCharCode(digit.charCodeAt(0) - 0x0BE6 + 48)
  );

  // 4. Canonical composition.
  return asciiDigits.normalize('NFC');
}
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import iso15919 from './schemes/iso15919.js';
|
|
2
|
+
import practical from './schemes/practical.js';
|
|
3
|
+
import alaLc from './schemes/alaLc.js';
|
|
4
|
+
import { tokenTypes } from './tokenizer.js';
|
|
5
|
+
import { modifierTypes } from './decomposer.js';
|
|
6
|
+
|
|
7
|
+
// Registry of transliteration schemes; both 'alaLc' and 'ala-lc' spellings
// resolve to the same ALA-LC table.
const schemes = {
  iso15919,
  practical,
  alaLc,
  'ala-lc': alaLc
};
|
|
13
|
+
|
|
14
|
+
// Maps a dependent vowel sign (the modifier half of a cluster) to the
// independent vowel letter it represents, so schemes need only one vowel table.
export const vowelSignToBase = {
  '\u0BBE': 'ஆ', // ா
  '\u0BBF': 'இ', // ி
  '\u0BC0': 'ஈ', // ீ
  '\u0BC1': 'உ', // ு
  '\u0BC2': 'ஊ', // ூ
  '\u0BC6': 'எ', // ெ
  '\u0BC7': 'ஏ', // ே
  '\u0BC8': 'ஐ', // ை
  '\u0BCA': 'ஒ', // ொ
  '\u0BCB': 'ஓ', // ோ
  '\u0BCC': 'ஔ', // ௌ
  '\u0BD7': 'ஔ' // ௗ (au length mark; treated as the au vowel)
};
|
|
29
|
+
|
|
30
|
+
/**
 * Maps fully decorated tokens into romanized structures via the selected scheme.
 *
 * String-valued consonant entries (ISO style) apply in every context; object
 * entries (practical style) are keyed by contextTag. GEMINATE entries hold the
 * doubled spelling, split between the virama half (first char) and the
 * vowel-carrying half (the rest).
 *
 * @param {Array<Object>} tokens - Analyzed tokens from Layer 3.
 * @param {string} [schemeName='practical'] - 'practical', 'iso15919', 'ala-lc'.
 * @param {Object} [customTable=null] - User overrides keyed by consonant base.
 * @returns {Array<Object>} Tokens with a `romanized` property added.
 */
export function resolveScheme(tokens, schemeName = 'practical', customTable = null) {
  const activeScheme = schemes[schemeName] || practical;

  return tokens.map((token) => {
    // Non-Tamil text and numerals pass straight through.
    if (token.type === tokenTypes.OTHER) {
      return { ...token, romanized: token.text };
    }

    // Pure vowels resolve directly from the vowel table.
    if (token.type === tokenTypes.VOWEL) {
      return { ...token, romanized: activeScheme.vowels[token.base] || token.base };
    }

    // Consonant path: a custom override wins over the scheme's entry.
    const mapping = (customTable && customTable[token.base]) || activeScheme.consonants[token.base];

    let consonantPart;
    if (typeof mapping === 'string') {
      // ISO style: one spelling for all contexts.
      consonantPart = mapping;
    } else if (mapping && typeof mapping === 'object') {
      if (token.contextTag === 'GEMINATE' && mapping.GEMINATE) {
        const doubled = mapping.GEMINATE;
        // Virama half takes the first char (e.g. 't' of 'tch'),
        // the vowel half takes the remainder (e.g. 'ch').
        consonantPart = token.modifierType === modifierTypes.VIRAMA
          ? doubled.charAt(0)
          : doubled.slice(1);
      } else {
        consonantPart = mapping[token.contextTag] || mapping['DEFAULT'] || token.base;
      }
    } else {
      // No mapping available: fall back to the raw base character.
      consonantPart = token.base;
    }

    // Attach the vowel component; VIRAMA contributes no vowel at all.
    let vowelPart = '';
    if (token.modifierType === modifierTypes.NULL) {
      vowelPart = activeScheme.vowels['அ'] || 'a';
    } else if (token.modifierType === modifierTypes.VOWEL_SIGN) {
      const pureVowel = vowelSignToBase[token.modifier];
      if (pureVowel) {
        vowelPart = activeScheme.vowels[pureVowel] || '';
      }
    }

    return { ...token, romanized: consonantPart + vowelPart };
  });
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import iso15919 from './iso15919.js';
|
|
2
|
+
|
|
3
|
+
// ALA-LC romanization of Tamil is, for this library's character set,
// identical to ISO 15919: both use macrons for long vowels (ā, ī, ū, ē, ō)
// and the same under-dot/under-bar consonants (ṟ, ṉ, ḻ, ḷ, ...), so we
// export a shallow clone of the ISO 15919 tables.
// NOTE(review): if a genuine ALA-LC delta is identified later, override the
// relevant entries here rather than editing iso15919.js.
const alaLc = {
  vowels: { ...iso15919.vowels },
  consonants: { ...iso15919.consonants }
};

export default alaLc;
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
// ISO 15919 transliteration tables for Tamil: one-to-one, context-free
// mappings. Long vowels carry macrons; Tamil-specific consonants use the
// standard diacritics (ṭ, ṇ, ḻ, ḷ, ṟ, ṉ).
export default {
  vowels: {
    'அ': 'a', 'ஆ': 'ā', 'இ': 'i', 'ஈ': 'ī',
    'உ': 'u', 'ஊ': 'ū', 'எ': 'e', 'ஏ': 'ē',
    'ஐ': 'ai', 'ஒ': 'o', 'ஓ': 'ō', 'ஔ': 'au'
  },
  consonants: {
    'க': 'k', 'ங': 'ṅ', 'ச': 'c', 'ஞ': 'ñ',
    'ட': 'ṭ', 'ண': 'ṇ', 'த': 't', 'ந': 'n',
    'ப': 'p', 'ம': 'm', 'ய': 'y', 'ர': 'r',
    'ல': 'l', 'வ': 'v', 'ழ': 'ḻ', 'ள': 'ḷ',
    'ற': 'ṟ', 'ன': 'ṉ',
    // Grantha mappings (letters used for loan words)
    'ஜ': 'j', 'ஷ': 'ṣ', 'ஸ': 's', 'ஹ': 'h'
  }
};
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
// 'practical' (Tanglish) scheme: context-sensitive tables.
// Each consonant maps to a per-contextTag object; GEMINATE entries hold the
// doubled spelling, which the resolver splits between the virama half
// (first char) and the vowel-carrying half (the rest).
export default {
  vowels: {
    'அ': 'a', 'ஆ': 'aa', 'இ': 'i', 'ஈ': 'ee',
    'உ': 'u', 'ஊ': 'oo', 'எ': 'e', 'ஏ': 'ae',
    'ஐ': 'ai', 'ஒ': 'o', 'ஓ': 'oa', 'ஔ': 'au'
  },
  consonants: {
    // Stops: voiced intervocalically and after nasals.
    'க': { DEFAULT: 'k', INTERVOCALIC: 'g', POST_NASAL: 'g', GEMINATE: 'kk' },
    'ச': { DEFAULT: 's', WORD_INITIAL: 's', INTERVOCALIC: 's', POST_NASAL: 'j', GEMINATE: 'chch' },
    'ட': { DEFAULT: 't', INTERVOCALIC: 'd', POST_NASAL: 'd', GEMINATE: 'tt' },
    'த': { DEFAULT: 'th', INTERVOCALIC: 'd', POST_NASAL: 'd', GEMINATE: 'tth' },
    'ப': { DEFAULT: 'p', INTERVOCALIC: 'b', POST_NASAL: 'b', GEMINATE: 'pp' },
    'ற': { DEFAULT: 'r', INTERVOCALIC: 'r', POST_NASAL: 'dr', GEMINATE: 'tr' },
    // Nasals and other consonants that change based on context or position
    'ங': { DEFAULT: 'n', WORD_INITIAL: 'ng' },
    'ஞ': { DEFAULT: 'n', WORD_INITIAL: 'gn' },
    // Strict direct mappings
    'ல': { DEFAULT: 'l' },
    'ள': { DEFAULT: 'l' }, // Often 'l' in modern names
    'ழ': { DEFAULT: 'zh' },
    'ந': { DEFAULT: 'n' },
    'ன': { DEFAULT: 'n' },
    'ண': { DEFAULT: 'n' },
    'ம': { DEFAULT: 'm' },
    'ய': { DEFAULT: 'y' },
    'ர': { DEFAULT: 'r' },
    'வ': { DEFAULT: 'v' },
    // Grantha mappings standard for practical
    'ஜ': { DEFAULT: 'j' },
    'ஷ': { DEFAULT: 'sh' },
    'ஸ': { DEFAULT: 's' },
    'ஹ': { DEFAULT: 'h' }
  }
};
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import { tokenTypes } from './tokenizer.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Handles Āytham (ஃ) resolution and Grantha sequence post-processing.
 *
 * Fixes: the previous revision mutated the caller's token objects
 * (nextToken.romanized) and threw if the following token lacked a
 * `romanized` string. This version keeps the transform local and falls
 * back to token.text safely.
 *
 * @param {Array<Object>} resolvedTokens - Tokens mapped by Layer 4 (contain 'romanized' and 'base').
 * @param {string} [schemeName='practical'] - 'practical', 'iso15919', 'alaLc'.
 * @returns {string} Final romanized string.
 */
export function handleSpecialTokens(resolvedTokens, schemeName = 'practical') {
  if (!Array.isArray(resolvedTokens)) return '';

  const isPractical = schemeName === 'practical';

  let outputString = '';
  // Set when a practical-scheme Āytham was just seen; it fricativizes
  // the next consonant instead of emitting anything itself.
  let aythamPending = false;

  for (const token of resolvedTokens) {
    if (token.text === 'ஃ') {
      if (isPractical) {
        aythamPending = true; // silent; transforms the next piece
      } else {
        outputString += 'ḵ'; // ISO 15919 renders Āytham as ḵ
      }
      continue;
    }

    let piece = token.romanized || token.text;
    if (aythamPending) {
      if (token.base === 'ப') {
        piece = piece.replace(/^[pb]/i, 'f'); // ஃப -> f
      } else if (token.base === 'ஜ') {
        piece = piece.replace(/^j/i, 'z');    // ஃஜ -> z
      }
      // Other followers (or trailing ஃ) are simply dropped in practical.
      aythamPending = false;
    }
    outputString += piece;
  }

  // Grantha post-processing: collapse sequences the compositional pipeline
  // can emit, practical scheme only.
  if (isPractical) {
    outputString = outputString.replace(/kṣ/g, 'ksh');
    outputString = outputString.replace(/sree/g, 'sri');
  }

  return outputString;
}
|
package/src/tokenizer.js
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
// Grapheme-cluster segmenter: one segment == one user-perceived Tamil character.
const segmenter = new Intl.Segmenter('ta-IN', { granularity: 'grapheme' });
|
|
2
|
+
|
|
3
|
+
// Cluster classification tags assigned by tokenize().
export const tokenTypes = {
  VOWEL: 'vowel',                               // independent vowel (அ..ஔ)
  CONSONANT_VIRAMA: 'consonant_virama',         // consonant + virama (vowel suppressed)
  CONSONANT_VOWEL_SIGN: 'consonant_vowel_sign', // consonant + dependent vowel sign
  CONSONANT_BARE: 'consonant_bare',             // lone consonant (inherent 'a')
  OTHER: 'other' // numerals, punctuation, spaces, non-tamil
};
|
|
10
|
+
|
|
11
|
+
// Independent vowels அ (U+0B85) through ஔ (U+0B94).
const isVowel = (char) => {
  const cp = char.codePointAt(0);
  return cp >= 0x0B85 && cp <= 0x0B94;
};
|
|
16
|
+
|
|
17
|
+
// Consonant letters க (U+0B95) through ஹ (U+0BB9), Grantha included.
const isConsonant = (char) => {
  const cp = char.codePointAt(0);
  return cp >= 0x0B95 && cp <= 0x0BB9;
};
|
|
22
|
+
|
|
23
|
+
// Virama / puḷḷi (், U+0BCD): suppresses the consonant's inherent vowel.
const isVirama = (char) => char === '\u0BCD';
|
|
25
|
+
|
|
26
|
+
// Dependent vowel signs ா (U+0BBE) through ௌ (U+0BCC), plus the au length
// mark ௗ (U+0BD7), which vowelSignToBase already maps to ஔ.
// Fix: the old condition `code <= 0x0BCD && code !== 0x0BCD` was a redundant
// way to write `code < 0x0BCD`, and U+0BD7 was rejected despite being
// supported downstream.
const isVowelSign = (char) => {
  const code = char.charCodeAt(0);
  return (code >= 0x0BBE && code < 0x0BCD) || code === 0x0BD7;
};
|
|
31
|
+
|
|
32
|
+
/**
 * Tokenizes a sanitized Tamil string into tagged grapheme clusters.
 *
 * Fix: removed leftover conversational draft comments ("Wait, ... Let's
 * verify") that shipped in the source; behavior is unchanged.
 *
 * @param {string} text - Cleaned Tamil text (after passing Layer 0).
 * @returns {Array<{text: string, type: string}>} Array of tagged clusters.
 */
export function tokenize(text) {
  if (typeof text !== 'string' || !text) return [];

  const tokens = [];
  for (const { segment } of segmenter.segment(text)) {
    let type = tokenTypes.OTHER;

    if (segment.length === 1) {
      // Single code unit: independent vowel or bare consonant.
      if (isVowel(segment)) {
        type = tokenTypes.VOWEL;
      } else if (isConsonant(segment)) {
        type = tokenTypes.CONSONANT_BARE;
      }
    } else if (segment.length > 1) {
      // Multi-unit cluster: classify by the first combining mark after
      // the base consonant (NFC in sanitize() guarantees composed signs).
      const base = segment[0];
      const modifier = segment[1];

      if (isConsonant(base)) {
        if (isVirama(modifier)) {
          type = tokenTypes.CONSONANT_VIRAMA;
        } else if (isVowelSign(modifier)) {
          type = tokenTypes.CONSONANT_VOWEL_SIGN;
        }
      }
      // Unrecognized clusters stay OTHER and are passed downstream
      // verbatim; the decomposer/resolver treat them as raw text.
    }

    tokens.push({ text: segment, type });
  }

  return tokens;
}
|