georgian-hyphenation 2.2.2 → 2.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +211 -67
- package/package.json +24 -29
- package/src/georgian_hyphenation/__init__.py +26 -0
- package/src/georgian_hyphenation/__pycache__/__init__.cpython-313.pyc +0 -0
- package/src/georgian_hyphenation/__pycache__/hyphenator.cpython-313.pyc +0 -0
- package/src/georgian_hyphenation/hyphenator.py +358 -0
- package/src/georgian_hyphenation/hyphenator.py.backup +312 -0
- package/src/georgian_hyphenation.egg-info/PKG-INFO +657 -0
- package/src/georgian_hyphenation.egg-info/SOURCES.txt +14 -0
- package/src/georgian_hyphenation.egg-info/dependency_links.txt +1 -0
- package/src/georgian_hyphenation.egg-info/requires.txt +3 -0
- package/src/georgian_hyphenation.egg-info/top_level.txt +2 -0
- package/src/javascript/index.js +60 -22
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Georgian Hyphenation Library v2.2.1
|
|
4
|
+
ქართული ენის დამარცვლის ბიბლიოთეკა
|
|
5
|
+
|
|
6
|
+
Modernized & Optimized
|
|
7
|
+
- Hybrid Engine: Algorithm + Dictionary
|
|
8
|
+
- Harmonic Clusters Support
|
|
9
|
+
- Gemination Handling
|
|
10
|
+
- O(1) Cluster Lookup with Set
|
|
11
|
+
|
|
12
|
+
Author: Guram Zhgamadze
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
import re
|
|
18
|
+
from typing import List, Dict, Set
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class GeorgianHyphenator:
    """
    Georgian language hyphenation with hybrid engine

    A word is first looked up in an optional exceptions dictionary; when it
    is not found there, break points are computed from the positions of the
    word's vowels.

    Features:
    - Phonological distance analysis
    - Dictionary-based exception handling
    - Harmonic cluster awareness
    - Gemination (double consonant) handling
    - Anti-orphan protection
    """

    # Punctuation that is ignored when a word is looked up in the dictionary.
    _PUNCTUATION_RE = re.compile(r'[.,/#!$%^&*;:{}=\-_`~()]')
    # A run of Georgian letters (the character range ა-ჰ).
    _GEORGIAN_RUN_RE = re.compile(r'[ა-ჰ]+')

    def __init__(self, hyphen_char: str = '\u00AD'):
        """
        Initialize Georgian Hyphenator

        Args:
            hyphen_char: Character to use for hyphenation (default: soft hyphen U+00AD)
        """
        self.hyphen_char = hyphen_char
        self.vowels = 'აეიოუ'
        # Minimum number of characters kept before / after any break point.
        self.left_min = 2
        self.right_min = 2

        # Two-consonant clusters that stay together at the start of a
        # syllable; kept in a set so membership tests are O(1).
        self.harmonic_clusters: Set[str] = {
            'ბლ', 'ბრ', 'ბღ', 'ბზ', 'გდ', 'გლ', 'გმ', 'გნ', 'გვ', 'გზ', 'გრ',
            'დრ', 'თლ', 'თრ', 'თღ', 'კლ', 'კმ', 'კნ', 'კრ', 'კვ', 'მტ', 'პლ',
            'პრ', 'ჟღ', 'რგ', 'რლ', 'რმ', 'სწ', 'სხ', 'ტკ', 'ტპ', 'ტრ', 'ფლ',
            'ფრ', 'ფქ', 'ფშ', 'ქლ', 'ქნ', 'ქვ', 'ქრ', 'ღლ', 'ღრ', 'ყლ', 'ყრ',
            'შთ', 'შპ', 'ჩქ', 'ჩრ', 'ცლ', 'ცნ', 'ცრ', 'ცვ', 'ძგ', 'ძვ', 'ძღ',
            'წლ', 'წრ', 'წნ', 'წკ', 'ჭკ', 'ჭრ', 'ჭყ', 'ხლ', 'ხმ', 'ხნ', 'ხვ', 'ჯგ',
        }

        # Exception words: plain word -> hyphenation written with '-'.
        self.dictionary: Dict[str, str] = {}

    def _strip_hyphens(self, text: str) -> str:
        """
        Remove existing hyphenation symbols (Sanitization)

        Removes soft hyphens (U+00AD), the configured hyphen character and
        every visible '-'.

        Args:
            text: Input text

        Returns:
            Text without any hyphens ('' for empty or None input)
        """
        if not text:
            return ''
        return text.replace('\u00AD', '').replace(self.hyphen_char, '').replace('-', '')

    def load_library(self, data: Dict[str, str]) -> None:
        """
        Load custom dictionary

        Entries are merged into the existing dictionary; empty or non-dict
        input is ignored.

        Args:
            data: Dictionary mapping words to their hyphenation
                  Example: {"საქართველო": "სა-ქარ-თვე-ლო"}
        """
        if data and isinstance(data, dict):
            self.dictionary.update(data)

    def load_default_library(self) -> None:
        """
        Load default exceptions dictionary from data/exceptions.json

        Several locations relative to this module are tried in order and the
        first existing file is used. This is best-effort: any failure is
        reported on stdout and the hyphenator keeps working with the
        algorithm only.
        """
        try:
            package_dir = os.path.dirname(__file__)

            candidates = [
                # Development mode (root data/ folder)
                os.path.join(package_dir, '..', '..', 'data', 'exceptions.json'),
                # Installed via pip (data/ copied to site-packages)
                os.path.join(os.path.dirname(package_dir), 'data', 'exceptions.json'),
                # Alternative installed location
                os.path.join(package_dir, 'data', 'exceptions.json'),
            ]

            data_file = next(
                (path for path in map(os.path.abspath, candidates) if os.path.exists(path)),
                None,
            )

            if data_file:
                with open(data_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                self.load_library(data)
                print(f"Georgian Hyphenation v2.2.1: Dictionary loaded ({len(self.dictionary)} words)")
            else:
                print("Georgian Hyphenation v2.2.1: Dictionary not found, using algorithm only")

        except Exception as e:
            # Deliberately broad: a missing or broken data file must never
            # prevent the algorithmic fallback from being used.
            print(f"Georgian Hyphenation v2.2.1: Could not load dictionary ({e}), using algorithm only")

    def hyphenate(self, word: str) -> str:
        """
        Hyphenate a Georgian word

        Existing hyphens are always stripped first and the word is
        re-hyphenated. The exceptions dictionary is consulted with
        punctuation removed; punctuation surrounding the word is kept in
        the result.

        Args:
            word: Georgian word to hyphenate

        Returns:
            Hyphenated word with configured hyphen character
        """
        return self._hyphenate_with(word, self.hyphen_char)

    def _hyphenate_with(self, word: str, marker: str) -> str:
        """Hyphenate `word` (dictionary first, then algorithm) using `marker`."""
        sanitized = self._strip_hyphens(word)
        lookup_key = self._PUNCTUATION_RE.sub('', sanitized)

        if lookup_key in self.dictionary:
            entry = self.dictionary[lookup_key].replace('-', marker)
            # The key is a contiguous substring of the sanitized word exactly
            # when all removed punctuation sat at the edges; in that case the
            # punctuation is re-attached instead of being dropped.
            start = sanitized.find(lookup_key) if lookup_key else -1
            if start == -1:
                return entry
            return sanitized[:start] + entry + sanitized[start + len(lookup_key):]

        return self._join_at(sanitized, self._break_points(sanitized), marker)

    def _break_points(self, word: str) -> List[int]:
        """Return the ascending indices in `word` before which a hyphen goes."""
        # Words too short to leave left_min/right_min characters on each side.
        if len(word) < (self.left_min + self.right_min):
            return []

        vowel_positions = [i for i, char in enumerate(word) if char in self.vowels]
        if len(vowel_positions) < 2:
            return []

        points = []
        for v1, v2 in zip(vowel_positions, vowel_positions[1:]):
            between = word[v1 + 1:v2]  # characters between two adjacent vowels

            if len(between) <= 1:
                # V-V (გა-ა-ნა-ლი-ზა) and V-C-V (მა-მა): split right after the vowel
                candidate = v1 + 1
            else:
                doubled = next(
                    (j for j in range(len(between) - 1) if between[j] == between[j + 1]),
                    -1,
                )
                if doubled != -1:
                    # Gemination: split between the doubled consonants (კლას-სი)
                    candidate = v1 + 1 + doubled + 1
                elif between[-2:] in self.harmonic_clusters:
                    # Keep the harmonic cluster with the next vowel (ას-ტრო-ნო-მი-ა)
                    candidate = v2 - 2
                else:
                    # Default: split after the first consonant (ბარ-ბა-რე)
                    candidate = v1 + 2

            # Anti-orphan protection: enforce the minimum on both sides.
            if candidate >= self.left_min and (len(word) - candidate) >= self.right_min:
                points.append(candidate)

        return points

    @staticmethod
    def _join_at(word: str, points: List[int], marker: str) -> str:
        """Insert `marker` into `word` before each index of ascending `points`."""
        pieces = []
        previous = 0
        for position in points:
            pieces.append(word[previous:position])
            previous = position
        pieces.append(word[previous:])
        return marker.join(pieces)

    def apply_algorithm(self, word: str) -> str:
        """
        Apply hyphenation algorithm

        Algorithm features:
        - Vowel-based syllable detection
        - Gemination (double consonant) handling
        - Harmonic cluster preservation
        - Anti-orphan protection (left_min=2, right_min=2 by default)

        The word is used as given: no sanitization and no dictionary lookup.

        Args:
            word: Word to hyphenate

        Returns:
            Hyphenated word
        """
        return self._join_at(word, self._break_points(word), self.hyphen_char)

    def get_syllables(self, word: str) -> List[str]:
        """
        Get syllables as a list

        Args:
            word: Word to split into syllables

        Returns:
            List of syllables without hyphen characters
        """
        # An empty hyphen_char cannot be used as a split separator
        # (str.split('') raises ValueError), so fall back to '-'; sanitization
        # guarantees '-' never occurs inside the word itself.
        marker = self.hyphen_char or '-'
        return self._hyphenate_with(word, marker).split(marker)

    def hyphenate_text(self, text: str) -> str:
        """
        Hyphenate entire Georgian text

        Every run of Georgian letters with 4+ characters is hyphenated;
        whitespace, other characters and shorter runs are copied unchanged.

        Note: existing hyphens are stripped from the entire text first. That
        includes every visible '-', also between non-Georgian characters.

        Args:
            text: Text to hyphenate (can contain multiple words)

        Returns:
            Hyphenated text
        """
        if not text:
            return ''

        sanitized_text = self._strip_hyphens(text)

        def rehyphenate(match):
            run = match.group(0)
            return self.hyphenate(run) if len(run) >= 4 else run

        return self._GEORGIAN_RUN_RE.sub(rehyphenate, sanitized_text)
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
# Convenience functions for backward compatibility and quick usage
|
|
284
|
+
|
|
285
|
+
def hyphenate(word: str, hyphen_char: str = '\u00AD') -> str:
    """
    Hyphenate one Georgian word with a throwaway hyphenator.

    A new GeorgianHyphenator is created on every call, so no exceptions
    dictionary is loaded.

    Args:
        word: Georgian word
        hyphen_char: Hyphen character to use

    Returns:
        Hyphenated word
    """
    return GeorgianHyphenator(hyphen_char).hyphenate(word)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def get_syllables(word: str) -> List[str]:
    """
    Split a Georgian word into its syllables.

    A temporary GeorgianHyphenator using '-' as the hyphen character does
    the work.

    Args:
        word: Georgian word

    Returns:
        List of syllables
    """
    splitter = GeorgianHyphenator('-')
    syllables = splitter.get_syllables(word)
    return syllables
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def hyphenate_text(text: str, hyphen_char: str = '\u00AD') -> str:
    """
    Hyphenate every Georgian word found in a text.

    A new GeorgianHyphenator is created on every call.

    Args:
        text: Text containing Georgian words
        hyphen_char: Hyphen character to use

    Returns:
        Hyphenated text
    """
    return GeorgianHyphenator(hyphen_char).hyphenate_text(text)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
# Export format converters (v2.0 compatibility)
|
|
330
|
+
|
|
331
|
+
def to_tex_pattern(word: str) -> str:
    """
    Render a word as a TeX hyphenation pattern.

    The syllables are joined with '1' and the result is wrapped in dots.

    Args:
        word: Georgian word

    Returns:
        TeX pattern (e.g., ".სა1ქარ1თვე1ლო.")
    """
    parts = GeorgianHyphenator('-').get_syllables(word)
    joined = '1'.join(parts)
    return f".{joined}."
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def to_hunspell_format(word: str) -> str:
    """
    Render a word in Hunspell hyphenation format.

    The word is hyphenated with '-' and every '-' is then replaced by '='.

    Args:
        word: Georgian word

    Returns:
        Hunspell format (e.g., "სა=ქარ=თვე=ლო")
    """
    dashed = GeorgianHyphenator('-').hyphenate(word)
    return dashed.replace('-', '=')
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Georgian Language Hyphenation Library
|
|
4
|
+
ქართული ენის დამარცვლის ბიბლიოთეკა
|
|
5
|
+
|
|
6
|
+
Supports multiple output formats:
|
|
7
|
+
- Soft hyphens for web/documents
|
|
8
|
+
- TeX hyphenation patterns
|
|
9
|
+
- Hunspell dictionary format
|
|
10
|
+
- Plain syllable lists
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
from functools import reduce
|
|
15
|
+
from typing import List, Dict, Optional
|
|
16
|
+
import json
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class GeorgianHyphenator:
    """
    Main hyphenation class for Georgian language

    Break points are found by applying a fixed cascade of regular-expression
    rules over consonant/vowel patterns.
    """

    def __init__(self, hyphen_char: str = '\u00AD'):
        """
        Initialize hyphenator with specified hyphen character

        Args:
            hyphen_char: Character to use for hyphenation points
                        Default is soft hyphen (U+00AD)
        """
        self.hyphen_char = hyphen_char
        self.C = '[ბგდვზთკლმნპჟრსტფქღყშჩცძწჭხჯჰ]'  # Consonants
        self.V = '[აეიოუ]'  # Vowels
        self.char = '[ა-ჰ]'  # All Georgian letters

    def count_vowels(self, word: str) -> int:
        """Count vowels in a word"""
        return sum(word.count(vowel) for vowel in "აეიოუ")

    def hyphenate(self, word: str) -> str:
        """
        Hyphenate a single Georgian word

        Args:
            word: Georgian word to hyphenate

        Returns:
            Word with hyphenation points inserted. Words with fewer than two
            vowels are returned unchanged, and so is every word when the
            hyphen character is empty.
        """
        # Don't hyphenate words with 0-1 vowels
        if self.count_vowels(word) <= 1:
            return word

        softhpn = self.hyphen_char
        if not softhpn:
            # Inserting an empty marker cannot change the word, and the
            # duplicate-collapsing pattern below would be invalid ("+").
            return word

        # Apply the rules four times: first with the real word boundaries,
        # then with already-inserted hyphens acting as start and/or end
        # boundaries.
        boundaries = (
            ('^', '$'),
            ('^', softhpn),
            (softhpn, '$'),
            (softhpn, softhpn),
        )
        result = word
        for startchar, endchar in boundaries:
            result = self._apply_rules(result, softhpn, startchar, endchar)

        # Collapse runs of hyphens; a callable replacement keeps the hyphen
        # literal even if it contains a backslash.
        return re.sub(f"{re.escape(softhpn)}+", lambda _m: softhpn, result, flags=re.U)

    def _apply_rules(self, w: str, softhpn: str, startchar: str, endchar: str) -> str:
        """
        Apply hyphenation regex rules

        Args:
            w: Text to process
            softhpn: Hyphen string to insert
            startchar: '^' for the word start, otherwise a literal string
                that must precede the match
            endchar: '$' for the word end, otherwise a literal string that
                must follow the match

        Returns:
            Text after all twelve rules have been applied in order. A literal
            boundary string that takes part in a match is not re-emitted.
        """
        C, V, char = self.C, self.V, self.char

        # Anything other than the anchors is matched literally, so hyphen
        # characters such as '.', '+' or '|' cannot alter the patterns.
        start = startchar if startchar == '^' else re.escape(startchar)
        end = endchar if endchar == '$' else re.escape(endchar)

        # (pattern, number of groups placed before the inserted hyphen)
        rules = (
            (f"({V})({C})({C}+)({V})", 2),              # Rule 1: V+C+C++V → VC|CV
            (f"({V})({C})({V})({C})({V})", 3),          # Rule 2: V+C+V+C+V → VCV|CV
            (f"({C})({V})({C})({V})", 2),               # Rule 3: C+V+C+V → CV|CV
            (f"({V})({V})({V})", 2),                    # Rule 4: V+V+V → VV|V
            (f"{start}({V})({C})({V})({C})({V})", 3),   # Rule 5: Word start - ^VCVCV
            (f"{start}({V})({C})({V})({C})({char})", 3),  # Rule 6: Word start - ^VCVCchar
            (f"{start}({C}+)({V})({C})({V})", 2),       # Rule 7: Word start - ^C++CVCV
            (f"{start}({C}+)({V})({V})({char})", 2),    # Rule 8: Word start - ^C++VVchar
            (f"({char})({V})({V})({C}+){end}", 2),      # Rule 9: Word end - charVVC++$
            (f"({char})({V})({C})({V}){end}", 2),       # Rule 10: Word end - charVCV$
            (f"({V})({C})({C}+)({V})({C}+){end}", 2),   # Rule 11: Word end - VCC++VC++$
            (f"({char})({V})({C})({V}+)({C}+){end}", 2),  # Rule 12: Word end - charVCVC++$
        )

        def split_after(count):
            # A callable replacement avoids template parsing, so a hyphen
            # such as '1' is not read as part of a group reference.
            def replace(match):
                groups = match.groups()
                return ''.join(groups[:count]) + softhpn + ''.join(groups[count:])
            return replace

        t = w
        for pattern, count in rules:
            t = re.sub(pattern, split_after(count), t, flags=re.U)
        return t

    def get_syllables(self, word: str) -> List[str]:
        """
        Get list of syllables for a word

        Args:
            word: Georgian word

        Returns:
            List of syllables

        Raises:
            ValueError: If the hyphen character is empty (str.split rejects
                an empty separator).
        """
        hyphenated = self.hyphenate(word)
        return hyphenated.split(self.hyphen_char)

    def hyphenate_text(self, text: str) -> str:
        """
        Hyphenate entire text

        The text is split on single spaces and each piece is hyphenated
        independently.

        Args:
            text: Georgian text

        Returns:
            Hyphenated text
        """
        return ' '.join(self.hyphenate(piece) for piece in text.split(' '))
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class TeXPatternGenerator:
    """Turn a hyphenator's syllable splits into TeX hyphenation patterns."""

    def __init__(self, hyphenator: GeorgianHyphenator):
        # Only get_syllables() of the hyphenator is used by this class.
        self.hyphenator = hyphenator

    def word_to_pattern(self, word: str) -> str:
        """
        Build the TeX pattern for one word.

        Syllables are joined with '1' and prefixed with a dot; a word that
        yields at most one syllable is emitted as-is after the dot.

        Example: საქართველო → .სა1ქარ1თვე1ლო
        """
        parts = self.hyphenator.get_syllables(word)
        body = "1".join(parts) if len(parts) > 1 else word
        return "." + body

    def generate_patterns_file(self, words: List[str], output_file: str):
        """
        Write a complete TeX \\patterns{...} file.

        Patterns are de-duplicated and written in sorted order; a
        confirmation line is printed afterwards.

        Args:
            words: List of Georgian words
            output_file: Path to output .tex file
        """
        # All patterns are computed before the file is opened.
        unique_patterns = sorted({self.word_to_pattern(w) for w in words})

        with open(output_file, 'w', encoding='utf-8') as out:
            out.write("% Georgian hyphenation patterns\n")
            out.write("% ქართული დამარცვლის პატერნები\n")
            out.write("% Generated automatically\n\n")
            out.write("\\patterns{\n")
            out.writelines(f" {entry}\n" for entry in unique_patterns)
            out.write("}\n")

        print(f"TeX patterns saved to {output_file}")
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
class HunspellDictionaryGenerator:
    """Write words in Hunspell's '='-separated hyphenation format."""

    def __init__(self, hyphenator: GeorgianHyphenator):
        # Only get_syllables() of the hyphenator is used by this class.
        self.hyphenator = hyphenator

    def word_to_hunspell(self, word: str) -> str:
        """
        Join the syllables of a word with '='.

        Example: საქართველო → სა=ქარ=თვე=ლო
        """
        return "=".join(self.hyphenator.get_syllables(word))

    def generate_dictionary(self, words: List[str], output_prefix: str):
        """
        Write the Hunspell .dic file and print a confirmation line.

        The file starts with a "UTF-8" line and the word count, followed by
        one converted word per line.

        Args:
            words: List of Georgian words
            output_prefix: Prefix for output files (e.g., 'hyph_ka_GE')
        """
        dic_file = f"{output_prefix}.dic"

        with open(dic_file, 'w', encoding='utf-8') as out:
            out.write(f"UTF-8\n{len(words)}\n")
            out.writelines(self.word_to_hunspell(entry) + "\n" for entry in words)

        print(f"Hunspell dictionary saved to {dic_file}")
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
class HyphenationExporter:
    """Dump hyphenation results for a word list as JSON or CSV."""

    def __init__(self, hyphenator: GeorgianHyphenator):
        # get_syllables() and hyphenate() of the hyphenator are used.
        self.hyphenator = hyphenator

    def export_json(self, words: List[str], output_file: str):
        """
        Write a JSON object keyed by word and print a confirmation line.

        Each value holds the word's "syllables" list and its "hyphenated"
        string.
        """
        engine = self.hyphenator
        payload = {
            entry: {
                "syllables": engine.get_syllables(entry),
                "hyphenated": engine.hyphenate(entry),
            }
            for entry in words
        }

        with open(output_file, 'w', encoding='utf-8') as out:
            json.dump(payload, out, ensure_ascii=False, indent=2)

        print(f"JSON export saved to {output_file}")

    def export_csv(self, words: List[str], output_file: str):
        """
        Write a CSV file and print a confirmation line.

        Columns are the word, its syllables joined with '-', and the
        syllable count.
        """
        import csv

        with open(output_file, 'w', encoding='utf-8', newline='') as out:
            rows = csv.writer(out)
            rows.writerow(['word', 'syllables', 'syllable_count'])

            for entry in words:
                parts = self.hyphenator.get_syllables(entry)
                rows.writerow([entry, '-'.join(parts), len(parts)])

        print(f"CSV export saved to {output_file}")
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
# Test and demonstration
|
|
261
|
+
if __name__ == "__main__":
    # Demo: print sample hyphenations, then write every export format into
    # the current working directory.
    banner = "=" * 60

    # The exporters use the default (soft hyphen) hyphenator.
    hyphenator = GeorgianHyphenator()

    test_words = [
        "საქართველო",
        "მთავრობა",
        "დედაქალაქი",
        "ტელევიზორი",
        "კომპიუტერი",
        "უნივერსიტეტი",
        "პარლამენტი",
        "დამოუკიდებლობა",
        "განათლება",
        "ეკონომიკა",
    ]

    print(banner)
    print("Georgian Hyphenation Examples")
    print("ქართული დამარცვლის მაგალითები")
    print(banner)
    print()

    # A visible '-' makes the break points readable on a terminal.
    visible_hyphenator = GeorgianHyphenator('-')
    for word in test_words:
        syllables = visible_hyphenator.get_syllables(word)
        hyphenated = visible_hyphenator.hyphenate(word)
        print(f"{word:20} → {hyphenated:25} [{len(syllables)} syllables]")

    print("\n" + banner)
    print("Generating export files...")
    print(banner)
    print()

    # TeX patterns
    TeXPatternGenerator(hyphenator).generate_patterns_file(test_words, "hyph-ka.tex")

    # Hunspell dictionary
    HunspellDictionaryGenerator(hyphenator).generate_dictionary(test_words, "hyph_ka_GE")

    # JSON and CSV exports
    exporter = HyphenationExporter(hyphenator)
    exporter.export_json(test_words, "georgian_hyphenation.json")
    exporter.export_csv(test_words, "georgian_hyphenation.csv")

    print("\n" + banner)
    print("All files generated successfully!")
    print(banner)
|