georgian-hyphenation 1.0.1 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README-NPM.md +620 -0
- package/README.md +261 -155
- package/package.json +7 -9
- package/src/javascript/index.js +151 -0
- package/dist/georgian_hyphenation-1.0.1-py3-none-any.whl +0 -0
- package/dist/georgian_hyphenation-1.0.1.tar.gz +0 -0
- package/dist/index.d.ts +0 -47
- package/dist/index.js +0 -199
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Georgian Language Hyphenation Library (v2.0 - Academic Logic)
|
|
3
|
+
* ქართული ენის დამარცვლის ბიბლიოთეკა
|
|
4
|
+
* Logic: Phonological distance analysis & Anti-Orphan protection.
|
|
5
|
+
* Author: Guram Zhgamadze
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
class GeorgianHyphenator {
  /**
   * Create a hyphenator for Georgian words.
   * @param {string} hyphenChar - String inserted at every break point
   *   (default: soft hyphen U+00AD).
   */
  constructor(hyphenChar = '\u00AD') {
    this.hyphenChar = hyphenChar;
    this.vowels = 'აეიოუ';
  }

  /**
   * Compute the indices at which a word may be broken.
   *
   * For every pair of consecutive vowels the candidate break is placed
   * directly after the first vowel when zero or one character separates
   * them, and after the first character of the cluster otherwise. A candidate
   * is kept only if at least two characters remain on each side of it.
   *
   * @private
   * @param {string} word - Word to analyse.
   * @returns {number[]} Strictly increasing break indices (empty when the
   *   word is shorter than 4 characters or has fewer than 2 vowels).
   */
  _findBreakPoints(word) {
    // Words shorter than 4 characters are never hyphenated.
    if (word.length < 4) return [];

    const vowelIndices = [];
    for (let i = 0; i < word.length; i++) {
      if (this.vowels.includes(word[i])) {
        vowelIndices.push(i);
      }
    }

    // At least two vowels are needed to form two syllables.
    if (vowelIndices.length < 2) return [];

    const breakPoints = [];
    for (let i = 0; i < vowelIndices.length - 1; i++) {
      const current = vowelIndices[i];
      const next = vowelIndices[i + 1];
      const gap = next - current - 1; // characters between the two vowels

      // V-V and V-C-V break right after the first vowel; V-CC...-V breaks
      // after the first character of the cluster. (The previous special case
      // for clusters starting with 'რ' produced the same position, so it is
      // folded into the general rule.)
      const candidate = gap <= 1 ? current + 1 : current + 2;

      // Anti-orphan filter: keep at least 2 characters on both sides.
      if (candidate >= 2 && word.length - candidate >= 2) {
        breakPoints.push(candidate);
      }
    }
    return breakPoints;
  }

  /**
   * Split a word into pieces at the given break indices.
   * @private
   * @param {string} word - Word to split.
   * @param {number[]} breakPoints - Increasing indices into `word`.
   * @returns {string[]} Pieces whose concatenation equals `word`.
   */
  _sliceAt(word, breakPoints) {
    const pieces = [];
    let start = 0;
    for (const point of breakPoints) {
      pieces.push(word.slice(start, point));
      start = point;
    }
    pieces.push(word.slice(start));
    return pieces;
  }

  /**
   * Hyphenate a single Georgian word.
   * @param {string} word - Georgian word to hyphenate.
   * @returns {string} The word with `hyphenChar` inserted at each break
   *   point, or the word unchanged when no break point exists.
   */
  hyphenate(word) {
    const breakPoints = this._findBreakPoints(word);
    if (breakPoints.length === 0) return word;
    return this._sliceAt(word, breakPoints).join(this.hyphenChar);
  }

  /**
   * Get the syllables of a word.
   *
   * The word is sliced at the computed break points instead of being
   * hyphenated with a delimiter and split again, so characters already
   * present in the word (such as a literal '-') can never create empty or
   * missing syllables: joining the result always reproduces the input.
   *
   * @param {string} word - Georgian word.
   * @returns {string[]} Array of syllables.
   */
  getSyllables(word) {
    return this._sliceAt(word, this._findBreakPoints(word));
  }

  /**
   * Hyphenate every Georgian word in a text, leaving everything else
   * (spaces, punctuation, other scripts) untouched.
   * @param {string} text - Text containing Georgian words.
   * @returns {string} Hyphenated text.
   */
  hyphenateText(text) {
    // The capture group keeps the non-Georgian separators in the result, so
    // joining the parts restores the original text layout.
    const parts = text.split(/([^ა-ჰ]+)/);

    return parts.map(part => {
      // Only runs of at least 4 Georgian letters can be hyphenated.
      if (/[ა-ჰ]{4,}/.test(part)) {
        return this.hyphenate(part);
      }
      return part;
    }).join('');
  }
}
|
|
112
|
+
|
|
113
|
+
/**
 * Render a word as a TeX-style hyphenation pattern.
 *
 * The syllables returned by a default GeorgianHyphenator are joined with the
 * digit '1' and the result is wrapped in '.' on both sides. A word that
 * yields a single syllable is wrapped unchanged.
 *
 * @param {string} word - Georgian word
 * @returns {string} Pattern string delimited by dots
 */
function toTeXPattern(word) {
  const parts = new GeorgianHyphenator().getSyllables(word);
  // With nothing to join, fall back to the input word itself.
  const body = parts.length > 1 ? parts.join('1') : word;
  return `.${body}.`;
}
|
|
127
|
+
|
|
128
|
+
/**
 * Render a word in Hunspell hyphenation notation, with '=' between
 * syllables.
 *
 * @param {string} word - Georgian word
 * @returns {string} Syllables of the word joined by '='
 */
function toHunspellFormat(word) {
  return new GeorgianHyphenator().getSyllables(word).join('=');
}
|
|
136
|
+
|
|
137
|
+
// CommonJS: expose the public API when a `module.exports` object exists.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = { GeorgianHyphenator, toTeXPattern, toHunspellFormat };
}

// Browser: attach the same three names to `window` when it is defined.
if (typeof window !== 'undefined') {
  const browserApi = { GeorgianHyphenator, toTeXPattern, toHunspellFormat };
  for (const name of Object.keys(browserApi)) {
    window[name] = browserApi[name];
  }
}
|
|
Binary file
|
|
Binary file
|
package/dist/index.d.ts
DELETED
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Georgian Language Hyphenation Library
|
|
3
|
-
* ქართული ენის დამარცვლის ბიბლიოთეკა
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
export class GeorgianHyphenator {
  /**
   * Build a hyphenator.
   * @param hyphenChar - String inserted at hyphenation points; optional
   *   (documented default: U+00AD soft hyphen)
   */
  constructor(hyphenChar?: string);

  /**
   * Insert hyphenation points into one word.
   * @param word - Georgian word to hyphenate
   * @returns The word with hyphenation points inserted
   */
  hyphenate(word: string): string;

  /**
   * Split one word into its syllables.
   * @param word - Georgian word
   * @returns Array of syllables
   */
  getSyllables(word: string): string[];

  /**
   * Hyphenate a whole text.
   * @param text - Georgian text
   * @returns Hyphenated text
   */
  hyphenateText(text: string): string;
}
|
|
34
|
-
|
|
35
|
-
/**
 * Render a word as a TeX hyphenation pattern.
 * @param word - Georgian word
 * @returns TeX pattern string
 */
export function toTeXPattern(word: string): string;
|
|
41
|
-
|
|
42
|
-
/**
 * Render a word in Hunspell hyphenation format.
 * @param word - Georgian word
 * @returns Hunspell-format string
 */
export function toHunspellFormat(word: string): string;
|
package/dist/index.js
DELETED
|
@@ -1,199 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Georgian Language Hyphenation Library (JavaScript)
|
|
3
|
-
* ქართული ენის დამარცვლის ბიბლიოთეკა
|
|
4
|
-
*
|
|
5
|
-
* Usage:
|
|
6
|
-
* const hyphenator = new GeorgianHyphenator();
|
|
7
|
-
* const result = hyphenator.hyphenate("საქართველო");
|
|
8
|
-
* // Result: "სა\u00ADქარ\u00ADთვე\u00ADლო"
|
|
9
|
-
*/
|
|
10
|
-
|
|
11
|
-
class GeorgianHyphenator {
  /**
   * Initialize Georgian Hyphenator
   * @param {string} hyphenChar - Character to use for hyphenation (default: soft hyphen U+00AD)
   */
  constructor(hyphenChar = '\u00AD') {
    this.hyphenChar = hyphenChar;
    this.C = '[ბგდვზთკლმნპჟრსტფქღყშჩცძწჭხჯჰ]'; // Consonants
    this.V = '[აეიოუ]'; // Vowels
    this.char = '[ა-ჰ]'; // All Georgian letters
  }

  /**
   * Count vowels in a word
   * @param {string} word - Georgian word
   * @returns {number} Number of vowels
   */
  countVowels(word) {
    const vowels = 'აეიოუ';
    let count = 0;
    for (const letter of word) {
      if (vowels.includes(letter)) {
        count += 1;
      }
    }
    return count;
  }

  /**
   * Apply the twelve hyphenation rules once, in order, to a string.
   *
   * `startchar` and `endchar` are regex fragments describing what must sit
   * immediately before / after a boundary rule's match. They are used as
   * zero-width assertions, so an existing hyphen acting as a boundary is
   * never consumed (and therefore never dropped) by a replacement.
   *
   * @private
   * @param {string} w - Text to process
   * @param {string} softhpn - Literal hyphen string to insert
   * @param {string} startchar - Regex fragment for the left boundary (e.g. '^')
   * @param {string} endchar - Regex fragment for the right boundary (e.g. '$')
   * @returns {string} Text with hyphens inserted
   */
  _applyRules(w, softhpn, startchar, endchar) {
    const C = this.C;
    const V = this.V;
    const char = this.char;
    const start = `(?<=${startchar})`;
    const end = `(?=${endchar})`;

    // Each entry: [pattern, number of leading groups kept left of the hyphen].
    const rules = [
      [`(${V})(${C})(${C}+)(${V})`, 2], // Rule 1: VCC+V -> VC|C+V
      [`(${V})(${C})(${V})(${C})(${V})`, 3], // Rule 2: VCVCV -> VCV|CV
      [`(${C})(${V})(${C})(${V})`, 2], // Rule 3: CVCV -> CV|CV
      [`(${V})(${V})(${V})`, 2], // Rule 4: VVV -> VV|V
      [`${start}(${V})(${C})(${V})(${C})(${V})`, 3], // Rule 5: start VCVCV
      [`${start}(${V})(${C})(${V})(${C})(${char})`, 3], // Rule 6: start VCVC+letter
      [`${start}(${C}+)(${V})(${C})(${V})`, 2], // Rule 7: start C+VCV
      [`${start}(${C}+)(${V})(${V})(${char})`, 2], // Rule 8: start C+VV+letter
      [`(${char})(${V})(${V})(${C}+)${end}`, 2], // Rule 9: letter+VVC+ end
      [`(${char})(${V})(${C})(${V})${end}`, 2], // Rule 10: letter+VCV end
      [`(${V})(${C})(${C}+)(${V})(${C}+)${end}`, 2], // Rule 11: VCC+VC+ end
      [`(${char})(${V})(${C})(${V}+)(${C}+)${end}`, 2], // Rule 12: letter+VCV+C+ end
    ];

    return rules.reduce(
      (text, [pattern, keepLeft]) =>
        // A replacer function is used instead of a "$1$2..." template so the
        // hyphen string is inserted literally even if it contains '$'.
        text.replace(new RegExp(pattern, 'gu'), (...args) => {
          const groups = args.slice(1, -2); // drop full match, offset, input
          return groups.slice(0, keepLeft).join('') + softhpn + groups.slice(keepLeft).join('');
        }),
      w
    );
  }

  /**
   * Hyphenate a single Georgian word
   * @param {string} word - Georgian word to hyphenate
   * @returns {string} Word with hyphenation points
   */
  hyphenate(word) {
    // Don't hyphenate words with 0-1 vowels
    if (this.countVowels(word) <= 1) {
      return word;
    }

    const softhpn = this.hyphenChar;
    const boundary = this._escapeRegex(softhpn);

    // Four passes: word edges first, then already-inserted hyphens are
    // treated as additional boundaries on either side. Every pass receives
    // the literal hyphen as the insert string and regex fragments as the
    // boundaries.
    let result = this._applyRules(word, softhpn, '^', '$');
    result = this._applyRules(result, softhpn, '^', boundary);
    result = this._applyRules(result, softhpn, boundary, '$');
    result = this._applyRules(result, softhpn, boundary, boundary);

    // Collapse runs of the (possibly multi-character) hyphen into one.
    return result.replace(new RegExp(`(?:${boundary})+`, 'gu'), () => softhpn);
  }

  /**
   * Get array of syllables for a word
   * @param {string} word - Georgian word
   * @returns {string[]} Array of syllables
   */
  getSyllables(word) {
    const hyphenated = this.hyphenate(word);
    return hyphenated.split(this.hyphenChar);
  }

  /**
   * Hyphenate entire text; words are the space-separated pieces of the text.
   * @param {string} text - Georgian text
   * @returns {string} Hyphenated text
   */
  hyphenateText(text) {
    const words = text.split(' ');
    const hyphenatedWords = words.map(w => this.hyphenate(w));
    return hyphenatedWords.join(' ');
  }

  /**
   * Escape special regex characters
   * @private
   * @param {string} str - Literal text
   * @returns {string} Text safe to embed in a regular expression
   */
  _escapeRegex(str) {
    return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }
}
|
|
154
|
-
|
|
155
|
-
/**
 * Convert word to TeX pattern format.
 *
 * Syllables are joined with '1' and the pattern is delimited by '.' on both
 * sides; the dot marks a word boundary, so a whole-word pattern needs one at
 * the end as well as at the start.
 *
 * @param {string} word - Georgian word
 * @returns {string} TeX pattern
 */
function toTeXPattern(word) {
  const hyphenator = new GeorgianHyphenator();
  const syllables = hyphenator.getSyllables(word);
  const body = syllables.length <= 1 ? word : syllables.join('1');
  return `.${body}.`;
}
|
|
168
|
-
|
|
169
|
-
/**
 * Render a word in Hunspell hyphenation notation, with '=' between
 * syllables.
 *
 * @param {string} word - Georgian word
 * @returns {string} Hunspell format
 */
function toHunspellFormat(word) {
  return new GeorgianHyphenator().getSyllables(word).join('=');
}
|
|
179
|
-
|
|
180
|
-
// CommonJS: expose the public API when a `module.exports` object exists.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = { GeorgianHyphenator, toTeXPattern, toHunspellFormat };
}

// Browser: attach the same three names to `window` when it is defined.
if (typeof window !== 'undefined') {
  const browserApi = { GeorgianHyphenator, toTeXPattern, toHunspellFormat };
  for (const name of Object.keys(browserApi)) {
    window[name] = browserApi[name];
  }
}
|
|
195
|
-
|
|
196
|
-
// Example usage:
|
|
197
|
-
// const hyphenator = new GeorgianHyphenator('-'); // visible hyphens
|
|
198
|
-
// console.log(hyphenator.hyphenate("საქართველო")); // "სა-ქარ-თვე-ლო"
|
|
199
|
-
// console.log(hyphenator.getSyllables("საქართველო")); // ["სა", "ქარ", "თვე", "ლო"]
|