georgian-hyphenation 1.0.1 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,151 @@
1
+ /**
2
+ * Georgian Language Hyphenation Library (v2.0 - Academic Logic)
3
+ * ქართული ენის დამარცვლის ბიბლიოთეკა
4
+ * Logic: Phonological distance analysis & Anti-Orphan protection.
5
+ * Author: Guram Zhgamadze
6
+ */
7
+
8
+ class GeorgianHyphenator {
9
+ /**
10
+ * Initialize Georgian Hyphenator
11
+ * @param {string} hyphenChar - Character to use for hyphenation (default: soft hyphen U+00AD)
12
+ */
13
+ constructor(hyphenChar = '\u00AD') {
14
+ this.hyphenChar = hyphenChar;
15
+ this.vowels = 'აეიოუ';
16
+ }
17
+
18
+ /**
19
+ * Hyphenate a single Georgian word
20
+ * @param {string} word - Georgian word to hyphenate
21
+ * @returns {string} Word with hyphenation points
22
+ */
23
+ hyphenate(word) {
24
+ // 1. Safety Rule: Words shorter than 4 chars are never hyphenated
25
+ // (Prevents: "a-ra", "i-gi", "e-na")
26
+ if (word.length < 4) return word;
27
+
28
+ // 2. Find all vowel indices
29
+ let vowelIndices = [];
30
+ for (let i = 0; i < word.length; i++) {
31
+ if (this.vowels.includes(word[i])) {
32
+ vowelIndices.push(i);
33
+ }
34
+ }
35
+
36
+ // 3. If less than 2 vowels, cannot be hyphenated (e.g. "mcvrtnls")
37
+ if (vowelIndices.length < 2) return word;
38
+
39
+ let insertPoints = [];
40
+
41
+ // 4. Core Logic: Analyze distance between vowels
42
+ for (let i = 0; i < vowelIndices.length - 1; i++) {
43
+ let v1 = vowelIndices[i];
44
+ let v2 = vowelIndices[i + 1];
45
+ let distance = v2 - v1 - 1; // Number of consonants between vowels
46
+ let betweenSubstring = word.substring(v1 + 1, v2);
47
+
48
+ let candidatePos = -1;
49
+
50
+ if (distance === 0) {
51
+ // Case V-V (Hiatus): Split between vowels (ga-a-a-na-li-za)
52
+ candidatePos = v1 + 1;
53
+ } else if (distance === 1) {
54
+ // Case V-C-V: Split before consonant (ga-da)
55
+ candidatePos = v1 + 1;
56
+ } else {
57
+ // Case V-CC...-V: Cluster handling
58
+ // 'R' Rule: If cluster starts with 'r', keep it left (bar-bi)
59
+ // Otherwise, split after first consonant (saq-me) for balance
60
+ if (betweenSubstring[0] === 'რ') {
61
+ candidatePos = v1 + 2;
62
+ } else {
63
+ candidatePos = v1 + 2;
64
+ }
65
+ }
66
+
67
+ // 5. Critical Filter (Anti-Orphan / Anti-Widow)
68
+ // Ensure at least 2 characters remain on both sides of the hyphen
69
+ if (candidatePos >= 2 && (word.length - candidatePos) >= 2) {
70
+ insertPoints.push(candidatePos);
71
+ }
72
+ }
73
+
74
+ // 6. Reconstruct the word
75
+ let result = word.split('');
76
+ for (let i = insertPoints.length - 1; i >= 0; i--) {
77
+ result.splice(insertPoints[i], 0, this.hyphenChar);
78
+ }
79
+
80
+ return result.join('');
81
+ }
82
+
83
+ /**
84
+ * Get array of syllables for a word
85
+ * @param {string} word - Georgian word
86
+ * @returns {string[]} Array of syllables
87
+ */
88
+ getSyllables(word) {
89
+ // Use a temporary hyphenator with a safe delimiter to split
90
+ const tempHyphenator = new GeorgianHyphenator('-');
91
+ return tempHyphenator.hyphenate(word).split('-');
92
+ }
93
+
94
+ /**
95
+ * Hyphenate entire text (preserves punctuation)
96
+ * @param {string} text - Georgian text
97
+ * @returns {string} Hyphenated text
98
+ */
99
+ hyphenateText(text) {
100
+ // Improved Tokenizer: Splits by non-Georgian chars to protect punctuation
101
+ const parts = text.split(/([^ა-ჰ]+)/);
102
+
103
+ return parts.map(part => {
104
+ // Process only Georgian words with length >= 4
105
+ if (/[ა-ჰ]{4,}/.test(part)) {
106
+ return this.hyphenate(part);
107
+ }
108
+ return part;
109
+ }).join('');
110
+ }
111
+ }
112
+
113
/**
 * Render a word as a TeX-style hyphenation pattern.
 *
 * The word is split into syllables with a default GeorgianHyphenator, the
 * syllables are joined with the digit '1', and the result is wrapped in a
 * leading and a trailing dot (e.g. "გაანალიზა" -> ".გა1ა1ნა1ლი1ზა.").
 * Useful for LaTeX or TeX engines.
 *
 * @param {string} word - Georgian word
 * @returns {string} Pattern string
 */
function toTeXPattern(word) {
  const parts = new GeorgianHyphenator().getSyllables(word);
  // TeX patterns mark permitted breaks with odd digits; only '1' is used here.
  // A word without any break point is emitted as-is between the two dots.
  const body = parts.length > 1 ? parts.join('1') : word;
  return `.${body}.`;
}
127
+
128
/**
 * Render a word in Hunspell hyphenation form: its syllables, as produced by
 * a default GeorgianHyphenator, separated by '='.
 *
 * @param {string} word - Georgian word
 * @returns {string} Syllables joined with '='
 */
function toHunspellFormat(word) {
  const parts = new GeorgianHyphenator().getSyllables(word);
  return parts.join('=');
}
136
+
137
+ // Export for Node.js / CommonJS: runs only when a `module.exports` object exists
138
+ if (typeof module !== 'undefined' && module.exports) {
139
+ module.exports = {
140
+ GeorgianHyphenator,
141
+ toTeXPattern,
142
+ toHunspellFormat
143
+ };
144
+ }
145
+
146
+ // Export for Browser: attach the same three names to `window` as globals
147
+ if (typeof window !== 'undefined') {
148
+ window.GeorgianHyphenator = GeorgianHyphenator;
149
+ window.toTeXPattern = toTeXPattern;
150
+ window.toHunspellFormat = toHunspellFormat;
151
+ }
package/dist/index.d.ts DELETED
@@ -1,47 +0,0 @@
1
- /**
2
- * Georgian Language Hyphenation Library
3
- * ქართული ენის დამარცვლის ბიბლიოთეკა
4
- */
5
-
6
- export class GeorgianHyphenator {
7
- /**
8
- * Create a Georgian hyphenator
9
- * @param hyphenChar - String inserted at hyphenation points (default: U+00AD soft hyphen)
10
- */
11
- constructor(hyphenChar?: string);
12
-
13
- /**
14
- * Hyphenate a Georgian word
15
- * @param word - Georgian word to hyphenate
16
- * @returns Word with hyphenation points inserted; in the accompanying
- * index.js a word with fewer than two vowels is returned unchanged
17
- */
18
- hyphenate(word: string): string;
19
-
20
- /**
21
- * Get syllables for a Georgian word
22
- * @param word - Georgian word
23
- * @returns Array of syllables, obtained in the accompanying index.js by
- * splitting the hyphenated word on the hyphen character
24
- */
25
- getSyllables(word: string): string[];
26
-
27
- /**
28
- * Hyphenate entire text
29
- * @param text - Georgian text; the accompanying index.js splits it on
- * single space characters and hyphenates each piece
30
- * @returns Hyphenated text
31
- */
32
- hyphenateText(text: string): string;
33
- }
34
-
35
- /**
36
- * Convert word to TeX pattern format
37
- * @param word - Georgian word
38
- * @returns TeX pattern: in the accompanying index.js, a leading '.' followed
- * by the word's syllables joined with '1'
39
- */
40
- export function toTeXPattern(word: string): string;
41
-
42
- /**
43
- * Convert word to Hunspell format
44
- * @param word - Georgian word
45
- * @returns Hunspell format: in the accompanying index.js, the word's
- * syllables joined with '='
46
- */
47
- export function toHunspellFormat(word: string): string;
package/dist/index.js DELETED
@@ -1,199 +0,0 @@
1
- /**
2
- * Georgian Language Hyphenation Library (JavaScript)
3
- * ქართული ენის დამარცვლის ბიბლიოთეკა
4
- *
5
- * Usage:
6
- * const hyphenator = new GeorgianHyphenator();
7
- * const result = hyphenator.hyphenate("საქართველო");
8
- * // Result: "სა\u00ADქარ\u00ADთვე\u00ADლო"
9
- */
10
-
11
- class GeorgianHyphenator {
12
- /**
13
- * Initialize Georgian Hyphenator
14
- * @param {string} hyphenChar - Character to use for hyphenation (default: soft hyphen U+00AD)
- *
- * C, V and char hold regex character-class source text; _applyRules
- * interpolates them into the RegExp patterns it builds.
15
- */
16
- constructor(hyphenChar = '\u00AD') {
17
- this.hyphenChar = hyphenChar;
18
- this.C = '[ბგდვზთკლმნპჟრსტფქღყშჩცძწჭხჯჰ]'; // Consonants
19
- this.V = '[აეიოუ]'; // Vowels
20
- this.char = '[ა-ჰ]'; // All Georgian letters
21
- }
22
-
23
- /**
24
- * Count vowels in a word
- * Builds one global RegExp per vowel and sums the match counts.
25
- * @param {string} word - Georgian word
26
- * @returns {number} Number of vowels
27
- */
28
- countVowels(word) {
29
- const vowels = 'აეიოუ';
30
- let count = 0;
31
- for (let v of vowels) {
32
- count += (word.match(new RegExp(v, 'g')) || []).length;
33
- }
34
- return count;
35
- }
36
-
37
- /**
38
- * Apply hyphenation rules with specified boundary markers
- *
- * Runs twelve regex replacements in a fixed order; each one operates on
- * the output of the previous one, so the order matters.
- *
- * @param {string} w - Text to process
- * @param {string} softhpn - Text placed in the replacement string at each break
- * @param {string} startchar - Regex source used as the left boundary in rules 5-8
- * @param {string} endchar - Regex source used as the right boundary in rules 9-12
- * @returns {string} Text after all twelve replacements
- *
- * NOTE(review): startchar and endchar sit outside every capture group and
- * are not repeated in the replacement strings, so when they match real
- * text (a hyphen rather than ^ or $) that text is removed from the
- * output - confirm whether this is intended.
39
- * @private
40
- */
41
- _applyRules(w, softhpn, startchar, endchar) {
42
- const C = this.C;
43
- const V = this.V;
44
- const char = this.char;
45
-
46
- let t = w;
47
-
48
- // Rule 1: V C C+ V → VC|C+V (break after the first consonant of a cluster)
49
- t = t.replace(new RegExp(`(${V})(${C})(${C}+)(${V})`, 'gu'),
50
- `$1$2${softhpn}$3$4`);
51
-
52
- // Rule 2: V C V C V → VCV|CV
53
- t = t.replace(new RegExp(`(${V})(${C})(${V})(${C})(${V})`, 'gu'),
54
- `$1$2$3${softhpn}$4$5`);
55
-
56
- // Rule 3: C V C V → CV|CV
57
- t = t.replace(new RegExp(`(${C})(${V})(${C})(${V})`, 'gu'),
58
- `$1$2${softhpn}$3$4`);
59
-
60
- // Rule 4: V V V → VV|V
61
- t = t.replace(new RegExp(`(${V})(${V})(${V})`, 'gu'),
62
- `$1$2${softhpn}$3`);
63
-
64
- // Rule 5: Word start - ^VCVCV → VCV|CV
65
- t = t.replace(new RegExp(`${startchar}(${V})(${C})(${V})(${C})(${V})`, 'gu'),
66
- `$1$2$3${softhpn}$4$5`);
67
-
68
- // Rule 6: Word start - ^VCVC followed by any Georgian letter → VCV|C + letter
69
- t = t.replace(new RegExp(`${startchar}(${V})(${C})(${V})(${C})(${char})`, 'gu'),
70
- `$1$2$3${softhpn}$4$5`);
71
-
72
- // Rule 7: Word start - ^C+ V C V → C+V|CV
73
- t = t.replace(new RegExp(`${startchar}(${C}+)(${V})(${C})(${V})`, 'gu'),
74
- `$1$2${softhpn}$3$4`);
75
-
76
- // Rule 8: Word start - ^C+ V V followed by any Georgian letter → C+V|V + letter
77
- t = t.replace(new RegExp(`${startchar}(${C}+)(${V})(${V})(${char})`, 'gu'),
78
- `$1$2${softhpn}$3$4`);
79
-
80
- // Rule 9: Word end - letter V V C+ $ → letter V|V C+
81
- t = t.replace(new RegExp(`(${char})(${V})(${V})(${C}+)${endchar}`, 'gu'),
82
- `$1$2${softhpn}$3$4`);
83
-
84
- // Rule 10: Word end - letter V C V $ → letter V|CV
85
- t = t.replace(new RegExp(`(${char})(${V})(${C})(${V})${endchar}`, 'gu'),
86
- `$1$2${softhpn}$3$4`);
87
-
88
- // Rule 11: Word end - V C C+ V C+ $ → VC|C+ V C+
89
- t = t.replace(new RegExp(`(${V})(${C})(${C}+)(${V})(${C}+)${endchar}`, 'gu'),
90
- `$1$2${softhpn}$3$4$5`);
91
-
92
- // Rule 12: Word end - letter V C V+ C+ $ → letter V|C V+ C+
93
- t = t.replace(new RegExp(`(${char})(${V})(${C})(${V}+)(${C}+)${endchar}`, 'gu'),
94
- `$1$2${softhpn}$3$4$5`);
95
-
96
- return t;
97
- }
98
-
99
- /**
100
- * Hyphenate a single Georgian word
- *
- * Runs _applyRules four times with different boundary arguments, then
- * collapses runs of consecutive hyphen characters into one.
- *
- * NOTE(review): the third and fourth _applyRules calls pass three
- * arguments while the method takes four (w, softhpn, startchar, endchar).
- * As written, the third call receives '$' as startchar and the fourth
- * receives the escaped hyphen as startchar, and endchar is undefined in
- * both (it is interpolated into the patterns as the text "undefined").
- * This looks like a missing softhpn argument - confirm the intent.
101
- * @param {string} word - Georgian word to hyphenate
102
- * @returns {string} Word with hyphenation points
103
- */
104
- hyphenate(word) {
105
- // Don't hyphenate words with 0-1 vowels
106
- if (this.countVowels(word) <= 1) {
107
- return word;
108
- }
109
-
110
- const softhpn = this.hyphenChar;
111
-
112
- // Apply hyphenation rules with different boundary markers
113
- let result = this._applyRules(word, softhpn, '^', '$');
114
- result = this._applyRules(result, softhpn, '^', this._escapeRegex(softhpn));
115
- result = this._applyRules(result, this._escapeRegex(softhpn), '$');
116
- result = this._applyRules(result, this._escapeRegex(softhpn), this._escapeRegex(softhpn));
117
-
118
- // Remove duplicate hyphens
119
- const escapedHyphen = this._escapeRegex(softhpn);
120
- result = result.replace(new RegExp(`${escapedHyphen}+`, 'gu'), softhpn);
121
-
122
- return result;
123
- }
124
-
125
- /**
126
- * Get array of syllables for a word
- * Hyphenates the word and splits the result on this.hyphenChar.
127
- * @param {string} word - Georgian word
128
- * @returns {string[]} Array of syllables
129
- */
130
- getSyllables(word) {
131
- const hyphenated = this.hyphenate(word);
132
- return hyphenated.split(this.hyphenChar);
133
- }
134
-
135
- /**
136
- * Hyphenate entire text
- * Splits on single space characters only, so punctuation attached to a
- * word is passed to hyphenate() together with that word.
137
- * @param {string} text - Georgian text
138
- * @returns {string} Hyphenated text
139
- */
140
- hyphenateText(text) {
141
- const words = text.split(' ');
142
- const hyphenatedWords = words.map(w => this.hyphenate(w));
143
- return hyphenatedWords.join(' ');
144
- }
145
-
146
- /**
147
- * Escape special regex characters
- * Prefixes each of . * + ? ^ $ { } ( ) | [ ] \ with a backslash.
- * @param {string} str - Text to escape
- * @returns {string} Text safe to embed in a RegExp source
148
- * @private
149
- */
150
- _escapeRegex(str) {
151
- return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
152
- }
153
- }
154
-
155
/**
 * Render a word as a TeX-style pattern: a leading '.' followed by the
 * word's syllables (from a default GeorgianHyphenator) joined with '1'.
 * No trailing '.' is appended.
 *
 * @param {string} word - Georgian word
 * @returns {string} TeX pattern
 */
function toTeXPattern(word) {
  const parts = new GeorgianHyphenator().getSyllables(word);
  // A word that yields a single syllable is emitted as-is after the dot.
  const body = parts.length > 1 ? parts.join('1') : word;
  return `.${body}`;
}
168
-
169
/**
 * Render a word in Hunspell form: its syllables, as produced by a default
 * GeorgianHyphenator, separated by '='.
 *
 * @param {string} word - Georgian word
 * @returns {string} Hunspell format
 */
function toHunspellFormat(word) {
  const parts = new GeorgianHyphenator().getSyllables(word);
  return parts.join('=');
}
179
-
180
- // Export for use in Node.js or browser
181
- if (typeof module !== 'undefined' && module.exports) {
182
- module.exports = {
183
- GeorgianHyphenator,
184
- toTeXPattern,
185
- toHunspellFormat
186
- };
187
- }
188
-
189
- // Demo usage
190
- if (typeof window !== 'undefined') {
191
- window.GeorgianHyphenator = GeorgianHyphenator;
192
- window.toTeXPattern = toTeXPattern;
193
- window.toHunspellFormat = toHunspellFormat;
194
- }
195
-
196
- // Example usage:
197
- // const hyphenator = new GeorgianHyphenator('-'); // visible hyphens
198
- // console.log(hyphenator.hyphenate("საქართველო")); // "სა-ქარ-თვე-ლო"
199
- // console.log(hyphenator.getSyllables("საქართველო")); // ["სა", "ქარ", "თვე", "ლო"]