georgian-hyphenation 2.0.1 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.txt +1 -1
- package/README.md +63 -347
- package/data/exceptions.json +144 -0
- package/package.json +18 -8
- package/src/javascript/index.js +112 -96
- package/README-NPM.md +0 -620
package/package.json
CHANGED
|
@@ -1,16 +1,22 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "georgian-hyphenation",
|
|
3
|
-
"version": "2.
|
|
4
|
-
"description": "Georgian Language Hyphenation Library v2.
|
|
3
|
+
"version": "2.2.2",
|
|
4
|
+
"description": "Georgian Language Hyphenation Library v2.2.1 - Academic Logic with Sanitization & Dictionary Support",
|
|
5
|
+
"type": "module",
|
|
5
6
|
"main": "src/javascript/index.js",
|
|
6
7
|
"types": "src/javascript/index.d.ts",
|
|
7
8
|
"files": [
|
|
8
9
|
"src/javascript",
|
|
9
|
-
"
|
|
10
|
-
"
|
|
10
|
+
"data/exceptions.json",
|
|
11
|
+
"README.md",
|
|
12
|
+
"LICENSE.txt"
|
|
11
13
|
],
|
|
14
|
+
"exports": {
|
|
15
|
+
".": "./src/javascript/index.js",
|
|
16
|
+
"./data/*": "./data/*"
|
|
17
|
+
},
|
|
12
18
|
"scripts": {
|
|
13
|
-
"test": "
|
|
19
|
+
"test": "node test-suite.js"
|
|
14
20
|
},
|
|
15
21
|
"repository": {
|
|
16
22
|
"type": "git",
|
|
@@ -27,12 +33,16 @@
|
|
|
27
33
|
"linguistics",
|
|
28
34
|
"text-processing",
|
|
29
35
|
"i18n",
|
|
30
|
-
"localization"
|
|
36
|
+
"localization",
|
|
37
|
+
"sanitization"
|
|
31
38
|
],
|
|
32
39
|
"author": "Guram Zhgamadze <guramzhgamadze@gmail.com>",
|
|
33
40
|
"license": "MIT",
|
|
34
41
|
"bugs": {
|
|
35
42
|
"url": "https://github.com/guramzhgamadze/georgian-hyphenation/issues"
|
|
36
43
|
},
|
|
37
|
-
"homepage": "https://github.com/guramzhgamadze/georgian-hyphenation#readme"
|
|
38
|
-
|
|
44
|
+
"homepage": "https://github.com/guramzhgamadze/georgian-hyphenation#readme",
|
|
45
|
+
"dependencies": {
|
|
46
|
+
"georgian-hyphenation": "^2.2.1"
|
|
47
|
+
}
|
|
48
|
+
}
|
package/src/javascript/index.js
CHANGED
|
@@ -1,108 +1,151 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Georgian
|
|
3
|
-
*
|
|
4
|
-
* * Logic: Phonological distance analysis & Anti-Orphan protection.
|
|
5
|
-
* Author: Guram Zhgamadze
|
|
2
|
+
* Georgian Hyphenation Library v2.2.1
|
|
3
|
+
* Modernized & Optimized by GitHub Code Architect
|
|
6
4
|
*/
|
|
7
5
|
|
|
8
|
-
class GeorgianHyphenator {
|
|
9
|
-
/**
|
|
10
|
-
* Initialize Georgian Hyphenator
|
|
11
|
-
* @param {string} hyphenChar - Character to use for hyphenation (default: soft hyphen U+00AD)
|
|
12
|
-
*/
|
|
6
|
+
export default class GeorgianHyphenator {
|
|
13
7
|
constructor(hyphenChar = '\u00AD') {
|
|
14
8
|
this.hyphenChar = hyphenChar;
|
|
15
9
|
this.vowels = 'აეიოუ';
|
|
10
|
+
this.leftMin = 2;
|
|
11
|
+
this.rightMin = 2;
|
|
12
|
+
|
|
13
|
+
// ოპტიმიზაცია: გამოყენებულია Set სწრაფი ძებნისთვის (O(1))
|
|
14
|
+
this.harmonicClusters = new Set([
|
|
15
|
+
'ბლ', 'ბრ', 'ბღ', 'ბზ', 'გდ', 'გლ', 'გმ', 'გნ', 'გვ', 'გზ', 'გრ',
|
|
16
|
+
'დრ', 'თლ', 'თრ', 'თღ', 'კლ', 'კმ', 'კნ', 'კრ', 'კვ', 'მტ', 'პლ',
|
|
17
|
+
'პრ', 'ჟღ', 'რგ', 'რლ', 'რმ', 'სწ', 'სხ', 'ტკ', 'ტპ', 'ტრ', 'ფლ',
|
|
18
|
+
'ფრ', 'ფქ', 'ფშ', 'ქლ', 'ქნ', 'ქვ', 'ქრ', 'ღლ', 'ღრ', 'ყლ', 'ყრ',
|
|
19
|
+
'შთ', 'შპ', 'ჩქ', 'ჩრ', 'ცლ', 'ცნ', 'ცრ', 'ცვ', 'ძგ', 'ძვ', 'ძღ',
|
|
20
|
+
'წლ', 'წრ', 'წნ', 'წკ', 'ჭკ', 'ჭრ', 'ჭყ', 'ხლ', 'ხმ', 'ხნ', 'ხვ', 'ჯგ'
|
|
21
|
+
]);
|
|
22
|
+
|
|
23
|
+
this.dictionary = new Map();
|
|
16
24
|
}
|
|
17
25
|
|
|
18
26
|
/**
|
|
19
|
-
*
|
|
20
|
-
* @param {string} word - Georgian word to hyphenate
|
|
21
|
-
* @returns {string} Word with hyphenation points
|
|
27
|
+
* შლის არსებულ დამარცვლის სიმბოლოებს (Sanitization)
|
|
22
28
|
*/
|
|
29
|
+
_stripHyphens(text) {
|
|
30
|
+
if (!text) return '';
|
|
31
|
+
// Escape special regex characters
|
|
32
|
+
const escapedChar = this.hyphenChar.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
|
33
|
+
const regex = new RegExp(`[\u00AD${escapedChar}]`, 'g');
|
|
34
|
+
return text.replace(regex, '');
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
loadLibrary(data) {
|
|
38
|
+
if (data && typeof data === 'object') {
|
|
39
|
+
Object.entries(data).forEach(([word, hyphenated]) => {
|
|
40
|
+
this.dictionary.set(word, hyphenated);
|
|
41
|
+
});
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
async loadDefaultLibrary() {
|
|
46
|
+
// 1. Browser Environment
|
|
47
|
+
if (typeof window !== 'undefined' && typeof fetch !== 'undefined') {
|
|
48
|
+
try {
|
|
49
|
+
const response = await fetch('https://unpkg.com/georgian-hyphenation@2/data/exceptions.json');
|
|
50
|
+
if (!response.ok) throw new Error("Network response error");
|
|
51
|
+
const data = await response.json();
|
|
52
|
+
this.loadLibrary(data);
|
|
53
|
+
} catch (error) {
|
|
54
|
+
console.warn("Georgian Hyphenation: Using algorithm only (Fetch failed)");
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
// 2. Node.js Environment (ESM context)
|
|
58
|
+
else if (typeof process !== 'undefined') {
|
|
59
|
+
try {
|
|
60
|
+
// Node-ში ლოკალური ფაილის წაკითხვა
|
|
61
|
+
const { default: data } = await import('../../data/exceptions.json', {
|
|
62
|
+
assert: { type: 'json' }
|
|
63
|
+
});
|
|
64
|
+
this.loadLibrary(data);
|
|
65
|
+
} catch (error) {
|
|
66
|
+
console.warn("Georgian Hyphenation: Local dictionary not found");
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
|
|
23
71
|
hyphenate(word) {
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
72
|
+
const sanitizedWord = this._stripHyphens(word);
|
|
73
|
+
const cleanWord = sanitizedWord.replace(/[.,/#!$%^&*;:{}=\-_`~()]/g, "");
|
|
74
|
+
|
|
75
|
+
if (this.dictionary.has(cleanWord)) {
|
|
76
|
+
return this.dictionary.get(cleanWord).replace(/-/g, this.hyphenChar);
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
return this.applyAlgorithm(sanitizedWord);
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
applyAlgorithm(word) {
|
|
83
|
+
if (word.length < (this.leftMin + this.rightMin)) return word;
|
|
27
84
|
|
|
28
|
-
|
|
29
|
-
let vowelIndices = [];
|
|
85
|
+
const vowelIndices = [];
|
|
30
86
|
for (let i = 0; i < word.length; i++) {
|
|
31
|
-
if (this.vowels.includes(word[i]))
|
|
32
|
-
vowelIndices.push(i);
|
|
33
|
-
}
|
|
87
|
+
if (this.vowels.includes(word[i])) vowelIndices.push(i);
|
|
34
88
|
}
|
|
35
89
|
|
|
36
|
-
// 3. If less than 2 vowels, cannot be hyphenated (e.g. "mcvrtnls")
|
|
37
90
|
if (vowelIndices.length < 2) return word;
|
|
38
91
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
// 4. Core Logic: Analyze distance between vowels
|
|
92
|
+
const insertPoints = [];
|
|
42
93
|
for (let i = 0; i < vowelIndices.length - 1; i++) {
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
94
|
+
const v1 = vowelIndices[i];
|
|
95
|
+
const v2 = vowelIndices[i + 1];
|
|
96
|
+
const distance = v2 - v1 - 1;
|
|
97
|
+
const betweenSubstring = word.substring(v1 + 1, v2);
|
|
47
98
|
|
|
48
99
|
let candidatePos = -1;
|
|
49
100
|
|
|
50
|
-
if (distance === 0) {
|
|
51
|
-
// Case V-V (Hiatus): Split between vowels (ga-a-a-na-li-za)
|
|
52
|
-
candidatePos = v1 + 1;
|
|
53
|
-
} else if (distance === 1) {
|
|
54
|
-
// Case V-C-V: Split before consonant (ga-da)
|
|
101
|
+
if (distance === 0 || distance === 1) {
|
|
55
102
|
candidatePos = v1 + 1;
|
|
56
103
|
} else {
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
104
|
+
let doubleConsonantIndex = -1;
|
|
105
|
+
for (let j = 0; j < betweenSubstring.length - 1; j++) {
|
|
106
|
+
if (betweenSubstring[j] === betweenSubstring[j + 1]) {
|
|
107
|
+
doubleConsonantIndex = j;
|
|
108
|
+
break;
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
if (doubleConsonantIndex !== -1) {
|
|
113
|
+
candidatePos = v1 + 1 + doubleConsonantIndex + 1;
|
|
62
114
|
} else {
|
|
63
|
-
|
|
115
|
+
let breakIndex = -1;
|
|
116
|
+
if (distance >= 2) {
|
|
117
|
+
const lastTwo = betweenSubstring.substring(distance - 2, distance);
|
|
118
|
+
if (this.harmonicClusters.has(lastTwo)) {
|
|
119
|
+
breakIndex = distance - 2;
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
candidatePos = (breakIndex !== -1) ? v1 + 1 + breakIndex : v1 + 2;
|
|
64
123
|
}
|
|
65
124
|
}
|
|
66
125
|
|
|
67
|
-
|
|
68
|
-
// Ensure at least 2 characters remain on both sides of the hyphen
|
|
69
|
-
if (candidatePos >= 2 && (word.length - candidatePos) >= 2) {
|
|
126
|
+
if (candidatePos >= this.leftMin && (word.length - candidatePos) >= this.rightMin) {
|
|
70
127
|
insertPoints.push(candidatePos);
|
|
71
128
|
}
|
|
72
129
|
}
|
|
73
130
|
|
|
74
|
-
// 6. Reconstruct the word
|
|
75
131
|
let result = word.split('');
|
|
76
132
|
for (let i = insertPoints.length - 1; i >= 0; i--) {
|
|
77
133
|
result.splice(insertPoints[i], 0, this.hyphenChar);
|
|
78
134
|
}
|
|
79
|
-
|
|
80
135
|
return result.join('');
|
|
81
136
|
}
|
|
82
137
|
|
|
83
|
-
/**
|
|
84
|
-
* Get array of syllables for a word
|
|
85
|
-
* @param {string} word - Georgian word
|
|
86
|
-
* @returns {string[]} Array of syllables
|
|
87
|
-
*/
|
|
88
138
|
getSyllables(word) {
|
|
89
|
-
|
|
90
|
-
const tempHyphenator = new GeorgianHyphenator('-');
|
|
91
|
-
return tempHyphenator.hyphenate(word).split('-');
|
|
139
|
+
return this.hyphenate(word).split(this.hyphenChar);
|
|
92
140
|
}
|
|
93
141
|
|
|
94
|
-
/**
|
|
95
|
-
* Hyphenate entire text (preserves punctuation)
|
|
96
|
-
* @param {string} text - Georgian text
|
|
97
|
-
* @returns {string} Hyphenated text
|
|
98
|
-
*/
|
|
99
142
|
hyphenateText(text) {
|
|
100
|
-
|
|
101
|
-
const
|
|
102
|
-
|
|
143
|
+
if (!text) return '';
|
|
144
|
+
const sanitizedText = this._stripHyphens(text);
|
|
145
|
+
const parts = sanitizedText.split(/([ა-ჰ]+)/);
|
|
146
|
+
|
|
103
147
|
return parts.map(part => {
|
|
104
|
-
|
|
105
|
-
if (/[ა-ჰ]{4,}/.test(part)) {
|
|
148
|
+
if (part.length >= 4 && /[ა-ჰ]/.test(part)) {
|
|
106
149
|
return this.hyphenate(part);
|
|
107
150
|
}
|
|
108
151
|
return part;
|
|
@@ -110,42 +153,15 @@ class GeorgianHyphenator {
|
|
|
110
153
|
}
|
|
111
154
|
}
|
|
112
155
|
|
|
113
|
-
/**
|
|
114
|
-
* Convert word to TeX pattern format (e.g., .გ1ა1ა1ნ1ა1ლ1ი1ზ1ა.)
|
|
115
|
-
* Useful for LaTeX or TeX engines
|
|
116
|
-
*/
|
|
117
|
-
function toTeXPattern(word) {
|
|
118
|
-
const hyphenator = new GeorgianHyphenator();
|
|
119
|
-
const syllables = hyphenator.getSyllables(word);
|
|
120
|
-
if (syllables.length <= 1) {
|
|
121
|
-
return `.${word}.`;
|
|
122
|
-
}
|
|
123
|
-
// TeX hyphenation patterns usually use odd numbers (1, 3, 5) to indicate hyphens
|
|
124
|
-
// Here we simply join syllables with '1'
|
|
125
|
-
return '.' + syllables.join('1') + '.';
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
/**
|
|
129
|
-
* Convert word to Hunspell format (syllable=syllable)
|
|
156
|
+
/** * კროს-პლატფორმული მხარდაჭერა
|
|
130
157
|
*/
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
return syllables.join('=');
|
|
158
|
+
// 1. ბრაუზერისთვის (Global Object)
|
|
159
|
+
if (typeof window !== 'undefined') {
|
|
160
|
+
window.GeorgianHyphenator = GeorgianHyphenator;
|
|
135
161
|
}
|
|
136
162
|
|
|
137
|
-
//
|
|
163
|
+
// 2. Node.js (CommonJS) - იმ შემთხვევაში თუ ვინმე მაინც require-ს გამოიყენებს
|
|
164
|
+
// (მხოლოდ თუ module.exports არსებობს)
|
|
138
165
|
if (typeof module !== 'undefined' && module.exports) {
|
|
139
|
-
module.exports =
|
|
140
|
-
GeorgianHyphenator,
|
|
141
|
-
toTeXPattern,
|
|
142
|
-
toHunspellFormat
|
|
143
|
-
};
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
// Export for Browser
|
|
147
|
-
if (typeof window !== 'undefined') {
|
|
148
|
-
window.GeorgianHyphenator = GeorgianHyphenator;
|
|
149
|
-
window.toTeXPattern = toTeXPattern;
|
|
150
|
-
window.toHunspellFormat = toHunspellFormat;
|
|
166
|
+
module.exports = GeorgianHyphenator;
|
|
151
167
|
}
|