georgian-hyphenation 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.txt +21 -0
- package/README.md +318 -0
- package/dist/georgian_hyphenation-1.0.1-py3-none-any.whl +0 -0
- package/dist/georgian_hyphenation-1.0.1.tar.gz +0 -0
- package/dist/index.d.ts +47 -0
- package/dist/index.js +199 -0
- package/package.json +40 -0
package/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Guram Zhgamadze
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
# Georgian Language Hyphenation / ქართული ენის დამარცვლა
|
|
2
|
+
|
|
3
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
4
|
+
[Python](https://www.python.org/downloads/)
|
|
5
|
+
[JavaScript (ECMAScript)](https://www.ecma-international.org/)
|
|
6
|
+
[GitHub repository](https://github.com/guramzhgamadze/georgian-hyphenation)
|
|
7
|
+
|
|
8
|
+
A comprehensive hyphenation library for the Georgian language, supporting multiple output formats including TeX, Hunspell, and web standards.
|
|
9
|
+
|
|
10
|
+
ქართული ენის სრული დამარცვლის ბიბლიოთეკა, რომელიც მხარს უჭერს მრავალ ფორმატს: TeX, Hunspell და ვებ სტანდარტები.
|
|
11
|
+
|
|
12
|
+
## Features / ფუნქციები
|
|
13
|
+
|
|
14
|
+
- ✅ **Accurate syllabification** based on Georgian phonological rules
|
|
15
|
+
- ✅ **Multiple output formats**: Soft hyphens (U+00AD), TeX patterns, Hunspell dictionary
|
|
16
|
+
- ✅ **Python and JavaScript implementations** for maximum compatibility
|
|
17
|
+
- ✅ **Web-ready** with HTML/CSS/JS demo
|
|
18
|
+
- ✅ **Export capabilities**: JSON, CSV, TeX, Hunspell
|
|
19
|
+
- ✅ **Well-tested** with comprehensive Georgian word corpus
|
|
20
|
+
|
|
21
|
+
## Installation / ინსტალაცია
|
|
22
|
+
|
|
23
|
+
### Python
|
|
24
|
+
```
|
|
25
|
+
# Install from PyPI
|
|
26
|
+
pip install georgian-hyphenation
|
|
27
|
+
|
|
28
|
+
# Or install from source
|
|
29
|
+
git clone https://github.com/guramzhgamadze/georgian-hyphenation.git
|
|
30
|
+
cd georgian-hyphenation
|
|
31
|
+
pip install -e .
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### JavaScript
|
|
35
|
+
```
|
|
36
|
+
npm install georgian-hyphenation
|
|
37
|
+
# Or load dist/index.js directly from source
|
|
38
|
+
```
|
|
39
|
+
## Usage / გამოყენება
|
|
40
|
+
|
|
41
|
+
### Python
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from georgian_hyphenation import GeorgianHyphenator
|
|
45
|
+
|
|
46
|
+
# Initialize with soft hyphen (default)
|
|
47
|
+
hyphenator = GeorgianHyphenator()
|
|
48
|
+
|
|
49
|
+
# Hyphenate a word
|
|
50
|
+
word = "საქართველო"
|
|
51
|
+
result = hyphenator.hyphenate(word)
|
|
52
|
+
print(result) # საქართველო (with U+00AD soft hyphens)
|
|
53
|
+
|
|
54
|
+
# Get syllables as a list
|
|
55
|
+
syllables = hyphenator.getSyllables(word)
|
|
56
|
+
print(syllables) # ['სა', 'ქარ', 'თვე', 'ლო']
|
|
57
|
+
|
|
58
|
+
# Use visible hyphens for display
|
|
59
|
+
visible = GeorgianHyphenator('-')
|
|
60
|
+
print(visible.hyphenate(word)) # სა-ქარ-თვე-ლო
|
|
61
|
+
|
|
62
|
+
# Hyphenate an entire text word by word
|
|
63
|
+
text = "საქართველო არის ლამაზი ქვეყანა"
|
|
64
|
+
words = text.split()
|
|
65
|
+
hyphenated = ' '.join([hyphenator.hyphenate(w) for w in words])
|
|
66
|
+
print(hyphenated)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### JavaScript
|
|
70
|
+
|
|
71
|
+
```javascript
|
|
72
|
+
// Initialize hyphenator (Node.js: const { GeorgianHyphenator } = require('georgian-hyphenation');)
|
|
73
|
+
const hyphenator = new GeorgianHyphenator();
|
|
74
|
+
|
|
75
|
+
// Hyphenate a word
|
|
76
|
+
const word = "საქართველო";
|
|
77
|
+
const result = hyphenator.hyphenate(word);
|
|
78
|
+
console.log(result); // საქართველო (with U+00AD)
|
|
79
|
+
|
|
80
|
+
// Get syllables
|
|
81
|
+
const syllables = hyphenator.getSyllables(word);
|
|
82
|
+
console.log(syllables); // ['სა', 'ქარ', 'თვე', 'ლო']
|
|
83
|
+
|
|
84
|
+
// Use visible hyphens
|
|
85
|
+
const visible = new GeorgianHyphenator('-');
|
|
86
|
+
console.log(visible.hyphenate(word)); // სა-ქარ-თვე-ლო
|
|
87
|
+
|
|
88
|
+
// Hyphenate text
|
|
89
|
+
const text = "საქართველო არის ლამაზი ქვეყანა";
|
|
90
|
+
console.log(hyphenator.hyphenateText(text));
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### HTML/CSS Integration
|
|
94
|
+
|
|
95
|
+
```html
|
|
96
|
+
<!DOCTYPE html>
|
|
97
|
+
<html lang="ka">
|
|
98
|
+
<head>
|
|
99
|
+
<style>
|
|
100
|
+
.hyphenated {
|
|
101
|
+
hyphens: manual;
|
|
102
|
+
text-align: justify;
|
|
103
|
+
}
|
|
104
|
+
</style>
|
|
105
|
+
</head>
|
|
106
|
+
<body>
|
|
107
|
+
<p class="hyphenated" id="text"></p>
|
|
108
|
+
|
|
109
|
+
<script src="georgian-hyphenation.js"></script>
|
|
110
|
+
<script>
|
|
111
|
+
const hyphenator = new GeorgianHyphenator('\u00AD');
|
|
112
|
+
const text = "საქართველო არის ძალიან ლამაზი ქვეყანა";
|
|
113
|
+
document.getElementById('text').textContent =
|
|
114
|
+
hyphenator.hyphenateText(text);
|
|
115
|
+
</script>
|
|
116
|
+
</body>
|
|
117
|
+
</html>
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Export Formats / ექსპორტის ფორმატები
|
|
121
|
+
|
|
122
|
+
### TeX Patterns
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from georgian_hyphenation import TeXPatternGenerator
|
|
126
|
+
|
|
127
|
+
hyphenator = GeorgianHyphenator()
|
|
128
|
+
tex_gen = TeXPatternGenerator(hyphenator)
|
|
129
|
+
|
|
130
|
+
words = ["საქართველო", "მთავრობა", "დედაქალაქი"]
|
|
131
|
+
tex_gen.generate_patterns_file(words, "hyph-ka.tex")
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Output (`hyph-ka.tex`):
|
|
135
|
+
```tex
|
|
136
|
+
% Georgian hyphenation patterns
|
|
137
|
+
\patterns{
|
|
138
|
+
.სა1ქარ1თვე1ლო
|
|
139
|
+
.მთავ1რო1ბა
|
|
140
|
+
.დე1და1ქა1ლა1ქი
|
|
141
|
+
}
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Hunspell Dictionary
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
from georgian_hyphenation import HunspellDictionaryGenerator
|
|
148
|
+
|
|
149
|
+
hunspell_gen = HunspellDictionaryGenerator(hyphenator)
|
|
150
|
+
words = ["საქართველო", "მთავრობა"]
|
|
151
|
+
hunspell_gen.generate_dictionary(words, "hyph_ka_GE")
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Output (`hyph_ka_GE.dic`):
|
|
155
|
+
```
|
|
156
|
+
UTF-8
|
|
157
|
+
2
|
|
158
|
+
სა=ქარ=თვე=ლო
|
|
159
|
+
მთავ=რო=ბა
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### JSON Export
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
from georgian_hyphenation import HyphenationExporter
|
|
166
|
+
|
|
167
|
+
exporter = HyphenationExporter(hyphenator)
|
|
168
|
+
words = ["საქართველო", "მთავრობა"]
|
|
169
|
+
exporter.export_json(words, "georgian_hyphenation.json")
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Output:
|
|
173
|
+
```json
|
|
174
|
+
{
|
|
175
|
+
"საქართველო": {
|
|
176
|
+
"syllables": ["სა", "ქარ", "თვე", "ლო"],
|
|
177
|
+
"hyphenated": "საქართველო"
|
|
178
|
+
},
|
|
179
|
+
"მთავრობა": {
|
|
180
|
+
"syllables": ["მთავ", "რო", "ბა"],
|
|
181
|
+
"hyphenated": "მთავრობა"
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Hyphenation Rules / დამარცვლის წესები
|
|
187
|
+
|
|
188
|
+
The library implements Georgian syllabification rules based on phonological patterns:
|
|
189
|
+
|
|
190
|
+
ბიბლიოთეკა იყენებს ქართული ფონოლოგიის წესებზე დაფუძნებულ მარცვლების გამოყოფას:
|
|
191
|
+
|
|
192
|
+
1. **V+C+C+V** → VC|CV (ხმოვანი + თანხმოვანი + თანხმოვნები + ხმოვანი)
|
|
193
|
+
2. **V+C+V+C+V** → VCV|CV
|
|
194
|
+
3. **C+V+C+V** → CV|CV
|
|
195
|
+
4. **V+V+V** → VV|V (სამი ხმოვანი ზედიზედ)
|
|
196
|
+
5. Special rules for word boundaries (სიტყვის საზღვრების სპეციალური წესები)
|
|
197
|
+
|
|
198
|
+
Where:
|
|
199
|
+
- **V** = vowel (ხმოვანი): ა, ე, ი, ო, უ
|
|
200
|
+
- **C** = consonant (თანხმოვანი): ბ, გ, დ, ვ, ზ, თ, კ, ლ, მ, ნ, პ, ჟ, რ, ს, ტ, ფ, ქ, ღ, ყ, შ, ჩ, ც, ძ, წ, ჭ, ხ, ჯ, ჰ
|
|
201
|
+
|
|
202
|
+
## Examples / მაგალითები
|
|
203
|
+
|
|
204
|
+
| Word (სიტყვა) | Syllables (მარცვლები) | Pattern |
|
|
205
|
+
|---------------|----------------------|---------|
|
|
206
|
+
| საქართველო | სა-ქარ-თვე-ლო | .სა1ქარ1თვე1ლო |
|
|
207
|
+
| მთავრობა | მთავ-რო-ბა | .მთავ1რო1ბა |
|
|
208
|
+
| დედაქალაქი | დე-და-ქა-ლა-ქი | .დე1და1ქა1ლა1ქი |
|
|
209
|
+
| ტელევიზორი | ტე-ლე-ვი-ზო-რი | .ტე1ლე1ვი1ზო1რი |
|
|
210
|
+
| კომპიუტერი | კომ-პი-უ-ტე-რი | .კომ1პი1უ1ტე1რი |
|
|
211
|
+
| უნივერსიტეტი | უ-ნი-ვერ-სი-ტე-ტი | .უ1ნი1ვერ1სი1ტე1ტი |
|
|
212
|
+
|
|
213
|
+
## Testing / ტესტირება
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
# Python tests
|
|
217
|
+
python -m pytest tests/
|
|
218
|
+
|
|
219
|
+
# JavaScript tests
|
|
220
|
+
npm test
|
|
221
|
+
|
|
222
|
+
# Run demo
|
|
223
|
+
python georgian_hyphenation.py
|
|
224
|
+
# or open demo.html in browser
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
## Contributing / წვლილის შეტანა
|
|
228
|
+
|
|
229
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
230
|
+
|
|
231
|
+
მოხარული ვიქნებით თქვენი წვლილით! გთხოვთ გამოგზავნოთ Pull Request.
|
|
232
|
+
|
|
233
|
+
1. Fork the repository
|
|
234
|
+
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
|
|
235
|
+
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
|
|
236
|
+
4. Push to the branch (`git push origin feature/AmazingFeature`)
|
|
237
|
+
5. Open a Pull Request
|
|
238
|
+
|
|
239
|
+
## Integration with Popular Tools / ინტეგრაცია პოპულარულ ხელსაწყოებთან
|
|
240
|
+
|
|
241
|
+
### LibreOffice / OpenOffice
|
|
242
|
+
|
|
243
|
+
1. Generate Hunspell dictionary files
|
|
244
|
+
2. Copy to extensions directory:
|
|
245
|
+
- Linux: `~/.config/libreoffice/4/user/uno_packages/cache/`
|
|
246
|
+
- Windows: `%APPDATA%\LibreOffice\4\user\uno_packages\cache\`
|
|
247
|
+
- macOS: `~/Library/Application Support/LibreOffice/4/user/uno_packages/cache/`
|
|
248
|
+
|
|
249
|
+
### LaTeX / XeLaTeX
|
|
250
|
+
|
|
251
|
+
```latex
|
|
252
|
+
\documentclass{article}
|
|
253
|
+
\usepackage{polyglossia}
|
|
254
|
+
\setmainlanguage{georgian}
|
|
255
|
+
\usepackage{hyphenat}
|
|
256
|
+
|
|
257
|
+
% Include generated patterns
|
|
258
|
+
\input{hyph-ka.tex}
|
|
259
|
+
|
|
260
|
+
\begin{document}
|
|
261
|
+
საქართველო არის ძალიან ლამაზი ქვეყანა
|
|
262
|
+
\end{document}
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
### Web Browsers (CSS)
|
|
266
|
+
|
|
267
|
+
```css
|
|
268
|
+
html {
|
|
269
|
+
/* CSS has no "lang" property: declare the language in markup instead, e.g. <html lang="ka"> */
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
p {
|
|
273
|
+
hyphens: manual; /* Use with soft hyphens */
|
|
274
|
+
/* or */
|
|
275
|
+
hyphens: auto; /* If browser supports Georgian */
|
|
276
|
+
text-align: justify;
|
|
277
|
+
}
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
## Roadmap / სამომავლო გეგმები
|
|
281
|
+
|
|
282
|
+
- [ ] PyPI package release
|
|
283
|
+
- [x] NPM package release
|
|
284
|
+
- [ ] Browser extension (Chrome, Firefox)
|
|
285
|
+
- [ ] InDesign plugin
|
|
286
|
+
- [ ] MS Word add-in
|
|
287
|
+
- [ ] Submit to TeX Live hyphenation database
|
|
288
|
+
- [ ] Submit to Unicode CLDR
|
|
289
|
+
- [ ] Mobile apps (iOS, Android)
|
|
290
|
+
- [ ] API service
|
|
291
|
+
|
|
292
|
+
## License / ლიცენზია
|
|
293
|
+
|
|
294
|
+
This project is licensed under the MIT License - see the [LICENSE.txt](LICENSE.txt) file for details.
|
|
295
|
+
|
|
296
|
+
## Acknowledgments / მადლობა
|
|
297
|
+
|
|
298
|
+
- Based on Georgian phonological research
|
|
299
|
+
- Inspired by TeX hyphenation patterns
|
|
300
|
+
- Thanks to the Georgian linguistic community
|
|
301
|
+
|
|
302
|
+
## Contact / კონტაქტი
|
|
303
|
+
|
|
304
|
+
- GitHub Issues: [Report bugs or request features](https://github.com/guramzhgamadze/georgian-hyphenation/issues)
|
|
305
|
+
- Email: guramzhgamadze@gmail.com
|
|
306
|
+
|
|
307
|
+
## References / ლიტერატურა
|
|
308
|
+
|
|
309
|
+
- Georgian Language Phonology and Syllable Structure
|
|
310
|
+
- TeX Hyphenation Algorithm (Liang, 1983)
|
|
311
|
+
- Hunspell Hyphenation Documentation
|
|
312
|
+
- Unicode Standard for Georgian Script
|
|
313
|
+
|
|
314
|
+
---
|
|
315
|
+
|
|
316
|
+
Made with ❤️ for the Georgian language community
|
|
317
|
+
|
|
318
|
+
შექმნილია ❤️-ით ქართული ენის საზოგადოებისთვის
|
|
Binary file
|
|
Binary file
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Georgian Language Hyphenation Library
|
|
3
|
+
* ქართული ენის დამარცვლის ბიბლიოთეკა
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
export class GeorgianHyphenator {
  /**
   * Separator inserted at every hyphenation point.
   * Set by the constructor; defaults to the soft hyphen U+00AD.
   */
  hyphenChar: string;

  /**
   * Create a Georgian hyphenator.
   * @param hyphenChar - String to insert at hyphenation points (default: U+00AD soft hyphen)
   */
  constructor(hyphenChar?: string);

  /**
   * Count the Georgian vowels (ა, ე, ი, ო, უ) in a word.
   * @param word - Georgian word
   * @returns Number of vowels found
   */
  countVowels(word: string): number;

  /**
   * Hyphenate a single Georgian word.
   * Words with at most one vowel are returned unchanged.
   * @param word - Georgian word to hyphenate
   * @returns Word with hyphenation points inserted
   */
  hyphenate(word: string): string;

  /**
   * Get the syllables of a Georgian word.
   * @param word - Georgian word
   * @returns Array of syllables (the hyphenated word split on the separator)
   */
  getSyllables(word: string): string[];

  /**
   * Hyphenate every space-separated word of a text.
   * @param text - Georgian text
   * @returns Hyphenated text
   */
  hyphenateText(text: string): string;
}
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Convert word to TeX pattern format
|
|
37
|
+
* @param word - Georgian word
|
|
38
|
+
* @returns TeX pattern
|
|
39
|
+
*/
|
|
40
|
+
// Implemented in dist/index.js: syllabifies with a default (soft-hyphen) GeorgianHyphenator,
// joins the syllables with "1" and prefixes the result with ".".
export function toTeXPattern(word: string): string;
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* Convert word to Hunspell format
|
|
44
|
+
* @param word - Georgian word
|
|
45
|
+
* @returns Hunspell format
|
|
46
|
+
*/
|
|
47
|
+
// Implemented in dist/index.js: syllabifies with a default (soft-hyphen) GeorgianHyphenator
// and joins the syllables with "=".
export function toHunspellFormat(word: string): string;
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Georgian Language Hyphenation Library (JavaScript)
|
|
3
|
+
* ქართული ენის დამარცვლის ბიბლიოთეკა
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* const hyphenator = new GeorgianHyphenator();
|
|
7
|
+
* const result = hyphenator.hyphenate("საქართველო");
|
|
8
|
+
* // Result: "სა\u00ADქარ\u00ADთვე\u00ADლო"
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
/**
 * Rule-based hyphenator for Georgian words.
 *
 * A word is rewritten by a fixed list of regular-expression rules that insert
 * a separator between consonant/vowel groups. The rule list is applied four
 * times so that both the real word edges and already inserted separators can
 * act as "word start" / "word end" for the edge rules.
 */
class GeorgianHyphenator {
  /**
   * Initialize Georgian Hyphenator
   * @param {string} hyphenChar - String to insert at hyphenation points
   *   (default: soft hyphen U+00AD). May be longer than one character and may
   *   contain regex or `$` characters; it is always inserted literally.
   */
  constructor(hyphenChar = '\u00AD') {
    this.hyphenChar = hyphenChar;
    this.C = '[ბგდვზთკლმნპჟრსტფქღყშჩცძწჭხჯჰ]'; // Consonants
    this.V = '[აეიოუ]'; // Vowels
    this.char = '[ა-ჰ]'; // All Georgian letters
  }

  /**
   * Count vowels in a word
   * @param {string} word - Georgian word
   * @returns {number} Number of vowels
   */
  countVowels(word) {
    return (word.match(/[აეიოუ]/gu) || []).length;
  }

  /**
   * Apply the hyphenation rules once, using the given boundary markers.
   *
   * The start marker is captured and written back, and the end marker is only
   * looked ahead at, so a separator that serves as a boundary is never removed
   * from the text.
   *
   * @private
   * @param {string} w - Text to process
   * @param {string} softhpn - Separator to insert (inserted literally)
   * @param {string} startchar - Regex source for the "word start" boundary;
   *   must not contain capture groups of its own
   * @param {string} endchar - Regex source for the "word end" boundary
   * @returns {string} Text with separators inserted
   */
  _applyRules(w, softhpn, startchar, endchar) {
    const C = this.C;
    const V = this.V;
    const char = this.char;
    const start = `(${startchar})`;
    const end = `(?=${endchar})`;

    // Each entry: [pattern source, number of leading capture groups that stay
    // in front of the inserted separator]. For the word-start rules the first
    // group is the boundary itself.
    const rules = [
      [`(${V})(${C})(${C}+)(${V})`, 2], // Rule 1: V+C+C++V → VC|CV
      [`(${V})(${C})(${V})(${C})(${V})`, 3], // Rule 2: V+C+V+C+V → VCV|CV
      [`(${C})(${V})(${C})(${V})`, 2], // Rule 3: C+V+C+V → CV|CV
      [`(${V})(${V})(${V})`, 2], // Rule 4: V+V+V → VV|V
      [`${start}(${V})(${C})(${V})(${C})(${V})`, 4], // Rule 5: ^VCVCV
      [`${start}(${V})(${C})(${V})(${C})(${char})`, 4], // Rule 6: ^VCVCchar
      [`${start}(${C}+)(${V})(${C})(${V})`, 3], // Rule 7: ^C++VCV
      [`${start}(${C}+)(${V})(${V})(${char})`, 3], // Rule 8: ^C++VVchar
      [`(${char})(${V})(${V})(${C}+)${end}`, 2], // Rule 9: charVVC++$
      [`(${char})(${V})(${C})(${V})${end}`, 2], // Rule 10: charVCV$
      [`(${V})(${C})(${C}+)(${V})(${C}+)${end}`, 2], // Rule 11: VCC++VC++$
      [`(${char})(${V})(${C})(${V}+)(${C}+)${end}`, 2], // Rule 12: charVCVC++$
    ];

    let t = w;
    for (const [source, keepBefore] of rules) {
      // A replacer function (not a replacement string) keeps "$" in the
      // separator from being interpreted as a group reference.
      t = t.replace(new RegExp(source, 'gu'), (...args) => {
        // args = [match, group1..groupN, offset, input]
        const groups = args.slice(1, -2);
        return groups.slice(0, keepBefore).join('') + softhpn + groups.slice(keepBefore).join('');
      });
    }
    return t;
  }

  /**
   * Hyphenate a single Georgian word
   * @param {string} word - Georgian word to hyphenate
   * @returns {string} Word with hyphenation points
   */
  hyphenate(word) {
    // Don't hyphenate words with 0-1 vowels
    if (this.countVowels(word) <= 1) {
      return word;
    }

    const softhpn = this.hyphenChar;
    const boundary = this._escapeRegex(softhpn);

    // The raw separator is always what gets inserted; only the boundary
    // markers change between passes (word edges and/or existing separators).
    const passes = [
      ['^', '$'],
      ['^', boundary],
      [boundary, '$'],
      [boundary, boundary],
    ];

    let result = word;
    for (const [startchar, endchar] of passes) {
      result = this._applyRules(result, softhpn, startchar, endchar);
    }

    // Collapse runs of the separator; the group makes "+" repeat the whole
    // separator even when it is longer than one character.
    return result.replace(new RegExp(`(?:${boundary})+`, 'gu'), () => softhpn);
  }

  /**
   * Get array of syllables for a word
   * @param {string} word - Georgian word
   * @returns {string[]} Array of syllables
   */
  getSyllables(word) {
    return this.hyphenate(word).split(this.hyphenChar);
  }

  /**
   * Hyphenate entire text.
   * The text is split on single space characters only; anything else attached
   * to a word (punctuation, tabs, newlines) is passed to hyphenate() with it.
   * @param {string} text - Georgian text
   * @returns {string} Hyphenated text
   */
  hyphenateText(text) {
    return text
      .split(' ')
      .map((w) => this.hyphenate(w))
      .join(' ');
  }

  /**
   * Escape special regex characters
   * @private
   * @param {string} str - Text to be matched literally
   * @returns {string} Text that is safe to embed in a RegExp source
   */
  _escapeRegex(str) {
    return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }
}
|
|
154
|
+
|
|
155
|
+
/**
 * Build a TeX-style pattern string for a single word.
 *
 * The word is syllabified with a default GeorgianHyphenator; the syllables
 * are joined with "1" and the result is prefixed with ".". A word that is
 * not split is returned as "." followed by the word itself.
 *
 * @param {string} word - Georgian word
 * @returns {string} TeX pattern
 */
function toTeXPattern(word) {
  const parts = new GeorgianHyphenator().getSyllables(word);
  return parts.length > 1 ? '.' + parts.join('1') : `.${word}`;
}
|
|
168
|
+
|
|
169
|
+
/**
 * Render a word in Hunspell hyphenation notation.
 *
 * The word is syllabified with a default GeorgianHyphenator and the
 * syllables are joined with "=".
 *
 * @param {string} word - Georgian word
 * @returns {string} Hunspell format
 */
function toHunspellFormat(word) {
  return new GeorgianHyphenator().getSyllables(word).join('=');
}
|
|
179
|
+
|
|
180
|
+
// Export for use in Node.js or browser
|
|
181
|
+
if (typeof module !== 'undefined' && module.exports) {
|
|
182
|
+
module.exports = {
|
|
183
|
+
GeorgianHyphenator,
|
|
184
|
+
toTeXPattern,
|
|
185
|
+
toHunspellFormat
|
|
186
|
+
};
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
// Demo usage
|
|
190
|
+
if (typeof window !== 'undefined') {
|
|
191
|
+
window.GeorgianHyphenator = GeorgianHyphenator;
|
|
192
|
+
window.toTeXPattern = toTeXPattern;
|
|
193
|
+
window.toHunspellFormat = toHunspellFormat;
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
// Example usage:
|
|
197
|
+
// const hyphenator = new GeorgianHyphenator('-'); // visible hyphens
|
|
198
|
+
// console.log(hyphenator.hyphenate("საქართველო")); // "სა-ქარ-თვე-ლო"
|
|
199
|
+
// console.log(hyphenator.getSyllables("საქართველო")); // ["სა", "ქარ", "თვე", "ლო"]
|
package/package.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "georgian-hyphenation",
|
|
3
|
+
"version": "1.0.1",
|
|
4
|
+
"description": "Georgian Language Hyphenation Library - ქართული ენის დამარცვლის ბიბლიოთეკა",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"types": "dist/index.d.ts",
|
|
7
|
+
"files": [
|
|
8
|
+
"dist",
|
|
9
|
+
"README.md",
|
|
10
|
+
"LICENSE"
|
|
11
|
+
],
|
|
12
|
+
"scripts": {
|
|
13
|
+
"build": "node build.js",
|
|
14
|
+
"test": "node test.js",
|
|
15
|
+
"prepublishOnly": "npm run build"
|
|
16
|
+
},
|
|
17
|
+
"repository": {
|
|
18
|
+
"type": "git",
|
|
19
|
+
"url": "git+https://github.com/guramzhgamadze/georgian-hyphenation.git"
|
|
20
|
+
},
|
|
21
|
+
"keywords": [
|
|
22
|
+
"georgian",
|
|
23
|
+
"kartuli",
|
|
24
|
+
"ქართული",
|
|
25
|
+
"hyphenation",
|
|
26
|
+
"syllabification",
|
|
27
|
+
"nlp",
|
|
28
|
+
"natural-language-processing",
|
|
29
|
+
"linguistics",
|
|
30
|
+
"text-processing",
|
|
31
|
+
"i18n",
|
|
32
|
+
"localization"
|
|
33
|
+
],
|
|
34
|
+
"author": "Guram Zhgamadze <guramzhgamadze@gmail.com>",
|
|
35
|
+
"license": "MIT",
|
|
36
|
+
"bugs": {
|
|
37
|
+
"url": "https://github.com/guramzhgamadze/georgian-hyphenation/issues"
|
|
38
|
+
},
|
|
39
|
+
"homepage": "https://github.com/guramzhgamadze/georgian-hyphenation#readme"
|
|
40
|
+
}
|