georgian-hyphenation 2.0.1 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,16 +1,22 @@
1
1
  {
2
2
  "name": "georgian-hyphenation",
3
- "version": "2.0.1",
4
- "description": "Georgian Language Hyphenation Library v2.0 - Academic Logic with Phonological Distance Analysis",
3
+ "version": "2.2.1",
4
+ "description": "Georgian Language Hyphenation Library v2.2.1 - Academic Logic with Sanitization & Dictionary Support",
5
+ "type": "module",
5
6
  "main": "src/javascript/index.js",
6
7
  "types": "src/javascript/index.d.ts",
7
8
  "files": [
8
9
  "src/javascript",
9
- "README-NPM.md",
10
- "LICENSE"
10
+ "data/exceptions.json",
11
+ "README.md",
12
+ "LICENSE.txt"
11
13
  ],
14
+ "exports": {
15
+ ".": "./src/javascript/index.js",
16
+ "./data/*": "./data/*"
17
+ },
12
18
  "scripts": {
13
- "test": "echo \"Error: no test specified\" && exit 1"
19
+ "test": "node test-suite.js"
14
20
  },
15
21
  "repository": {
16
22
  "type": "git",
@@ -27,7 +33,8 @@
27
33
  "linguistics",
28
34
  "text-processing",
29
35
  "i18n",
30
- "localization"
36
+ "localization",
37
+ "sanitization"
31
38
  ],
32
39
  "author": "Guram Zhgamadze <guramzhgamadze@gmail.com>",
33
40
  "license": "MIT",
@@ -1,108 +1,151 @@
1
1
  /**
2
- * Georgian Language Hyphenation Library (v2.0 - Academic Logic)
3
- * ქართული ენის დამარცვლის ბიბლიოთეკა
4
- * * Logic: Phonological distance analysis & Anti-Orphan protection.
5
- * Author: Guram Zhgamadze
2
+ * Georgian Hyphenation Library v2.2.1
3
+ * Modernized & Optimized by GitHub Code Architect
6
4
  */
7
5
 
8
- class GeorgianHyphenator {
9
- /**
10
- * Initialize Georgian Hyphenator
11
- * @param {string} hyphenChar - Character to use for hyphenation (default: soft hyphen U+00AD)
12
- */
6
+ export default class GeorgianHyphenator {
13
7
  constructor(hyphenChar = '\u00AD') {
14
8
  this.hyphenChar = hyphenChar;
15
9
  this.vowels = 'აეიოუ';
10
+ this.leftMin = 2;
11
+ this.rightMin = 2;
12
+
13
+ // ოპტიმიზაცია: გამოყენებულია Set სწრაფი ძებნისთვის (O(1))
14
+ this.harmonicClusters = new Set([
15
+ 'ბლ', 'ბრ', 'ბღ', 'ბზ', 'გდ', 'გლ', 'გმ', 'გნ', 'გვ', 'გზ', 'გრ',
16
+ 'დრ', 'თლ', 'თრ', 'თღ', 'კლ', 'კმ', 'კნ', 'კრ', 'კვ', 'მტ', 'პლ',
17
+ 'პრ', 'ჟღ', 'რგ', 'რლ', 'რმ', 'სწ', 'სხ', 'ტკ', 'ტპ', 'ტრ', 'ფლ',
18
+ 'ფრ', 'ფქ', 'ფშ', 'ქლ', 'ქნ', 'ქვ', 'ქრ', 'ღლ', 'ღრ', 'ყლ', 'ყრ',
19
+ 'შთ', 'შპ', 'ჩქ', 'ჩრ', 'ცლ', 'ცნ', 'ცრ', 'ცვ', 'ძგ', 'ძვ', 'ძღ',
20
+ 'წლ', 'წრ', 'წნ', 'წკ', 'ჭკ', 'ჭრ', 'ჭყ', 'ხლ', 'ხმ', 'ხნ', 'ხვ', 'ჯგ'
21
+ ]);
22
+
23
+ this.dictionary = new Map();
16
24
  }
17
25
 
18
26
  /**
19
- * Hyphenate a single Georgian word
20
- * @param {string} word - Georgian word to hyphenate
21
- * @returns {string} Word with hyphenation points
27
+ * შლის არსებულ დამარცვლის სიმბოლოებს (Sanitization)
22
28
  */
29
+ _stripHyphens(text) {
30
+ if (!text) return '';
31
+ // Escape special regex characters
32
+ const escapedChar = this.hyphenChar.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
33
+ const regex = new RegExp(`[\u00AD${escapedChar}]`, 'g');
34
+ return text.replace(regex, '');
35
+ }
36
+
37
+ loadLibrary(data) {
38
+ if (data && typeof data === 'object') {
39
+ Object.entries(data).forEach(([word, hyphenated]) => {
40
+ this.dictionary.set(word, hyphenated);
41
+ });
42
+ }
43
+ }
44
+
45
+ async loadDefaultLibrary() {
46
+ // 1. Browser Environment
47
+ if (typeof window !== 'undefined' && typeof fetch !== 'undefined') {
48
+ try {
49
+ const response = await fetch('https://unpkg.com/georgian-hyphenation@2/data/exceptions.json');
50
+ if (!response.ok) throw new Error("Network response error");
51
+ const data = await response.json();
52
+ this.loadLibrary(data);
53
+ } catch (error) {
54
+ console.warn("Georgian Hyphenation: Using algorithm only (Fetch failed)");
55
+ }
56
+ }
57
+ // 2. Node.js Environment (ESM context)
58
+ else if (typeof process !== 'undefined') {
59
+ try {
60
+ // Node-ში ლოკალური ფაილის წაკითხვა
61
+ const { default: data } = await import('../../data/exceptions.json', {
62
+ assert: { type: 'json' }
63
+ });
64
+ this.loadLibrary(data);
65
+ } catch (error) {
66
+ console.warn("Georgian Hyphenation: Local dictionary not found");
67
+ }
68
+ }
69
+ }
70
+
23
71
  hyphenate(word) {
24
- // 1. Safety Rule: Words shorter than 4 chars are never hyphenated
25
- // (Prevents: "a-ra", "i-gi", "e-na")
26
- if (word.length < 4) return word;
72
+ const sanitizedWord = this._stripHyphens(word);
73
+ const cleanWord = sanitizedWord.replace(/[.,/#!$%^&*;:{}=\-_`~()]/g, "");
74
+
75
+ if (this.dictionary.has(cleanWord)) {
76
+ return this.dictionary.get(cleanWord).replace(/-/g, this.hyphenChar);
77
+ }
78
+
79
+ return this.applyAlgorithm(sanitizedWord);
80
+ }
81
+
82
+ applyAlgorithm(word) {
83
+ if (word.length < (this.leftMin + this.rightMin)) return word;
27
84
 
28
- // 2. Find all vowel indices
29
- let vowelIndices = [];
85
+ const vowelIndices = [];
30
86
  for (let i = 0; i < word.length; i++) {
31
- if (this.vowels.includes(word[i])) {
32
- vowelIndices.push(i);
33
- }
87
+ if (this.vowels.includes(word[i])) vowelIndices.push(i);
34
88
  }
35
89
 
36
- // 3. If less than 2 vowels, cannot be hyphenated (e.g. "mcvrtnls")
37
90
  if (vowelIndices.length < 2) return word;
38
91
 
39
- let insertPoints = [];
40
-
41
- // 4. Core Logic: Analyze distance between vowels
92
+ const insertPoints = [];
42
93
  for (let i = 0; i < vowelIndices.length - 1; i++) {
43
- let v1 = vowelIndices[i];
44
- let v2 = vowelIndices[i + 1];
45
- let distance = v2 - v1 - 1; // Number of consonants between vowels
46
- let betweenSubstring = word.substring(v1 + 1, v2);
94
+ const v1 = vowelIndices[i];
95
+ const v2 = vowelIndices[i + 1];
96
+ const distance = v2 - v1 - 1;
97
+ const betweenSubstring = word.substring(v1 + 1, v2);
47
98
 
48
99
  let candidatePos = -1;
49
100
 
50
- if (distance === 0) {
51
- // Case V-V (Hiatus): Split between vowels (ga-a-a-na-li-za)
52
- candidatePos = v1 + 1;
53
- } else if (distance === 1) {
54
- // Case V-C-V: Split before consonant (ga-da)
101
+ if (distance === 0 || distance === 1) {
55
102
  candidatePos = v1 + 1;
56
103
  } else {
57
- // Case V-CC...-V: Cluster handling
58
- // 'R' Rule: If cluster starts with 'r', keep it left (bar-bi)
59
- // Otherwise, split after first consonant (saq-me) for balance
60
- if (betweenSubstring[0] === 'რ') {
61
- candidatePos = v1 + 2;
104
+ let doubleConsonantIndex = -1;
105
+ for (let j = 0; j < betweenSubstring.length - 1; j++) {
106
+ if (betweenSubstring[j] === betweenSubstring[j + 1]) {
107
+ doubleConsonantIndex = j;
108
+ break;
109
+ }
110
+ }
111
+
112
+ if (doubleConsonantIndex !== -1) {
113
+ candidatePos = v1 + 1 + doubleConsonantIndex + 1;
62
114
  } else {
63
- candidatePos = v1 + 2;
115
+ let breakIndex = -1;
116
+ if (distance >= 2) {
117
+ const lastTwo = betweenSubstring.substring(distance - 2, distance);
118
+ if (this.harmonicClusters.has(lastTwo)) {
119
+ breakIndex = distance - 2;
120
+ }
121
+ }
122
+ candidatePos = (breakIndex !== -1) ? v1 + 1 + breakIndex : v1 + 2;
64
123
  }
65
124
  }
66
125
 
67
- // 5. Critical Filter (Anti-Orphan / Anti-Widow)
68
- // Ensure at least 2 characters remain on both sides of the hyphen
69
- if (candidatePos >= 2 && (word.length - candidatePos) >= 2) {
126
+ if (candidatePos >= this.leftMin && (word.length - candidatePos) >= this.rightMin) {
70
127
  insertPoints.push(candidatePos);
71
128
  }
72
129
  }
73
130
 
74
- // 6. Reconstruct the word
75
131
  let result = word.split('');
76
132
  for (let i = insertPoints.length - 1; i >= 0; i--) {
77
133
  result.splice(insertPoints[i], 0, this.hyphenChar);
78
134
  }
79
-
80
135
  return result.join('');
81
136
  }
82
137
 
83
- /**
84
- * Get array of syllables for a word
85
- * @param {string} word - Georgian word
86
- * @returns {string[]} Array of syllables
87
- */
88
138
  getSyllables(word) {
89
- // Use a temporary hyphenator with a safe delimiter to split
90
- const tempHyphenator = new GeorgianHyphenator('-');
91
- return tempHyphenator.hyphenate(word).split('-');
139
+ return this.hyphenate(word).split(this.hyphenChar);
92
140
  }
93
141
 
94
- /**
95
- * Hyphenate entire text (preserves punctuation)
96
- * @param {string} text - Georgian text
97
- * @returns {string} Hyphenated text
98
- */
99
142
  hyphenateText(text) {
100
- // Improved Tokenizer: Splits by non-Georgian chars to protect punctuation
101
- const parts = text.split(/([^ა-ჰ]+)/);
102
-
143
+ if (!text) return '';
144
+ const sanitizedText = this._stripHyphens(text);
145
+ const parts = sanitizedText.split(/([ა-ჰ]+)/);
146
+
103
147
  return parts.map(part => {
104
- // Process only Georgian words with length >= 4
105
- if (/[ა-ჰ]{4,}/.test(part)) {
148
+ if (part.length >= 4 && /[ა-ჰ]/.test(part)) {
106
149
  return this.hyphenate(part);
107
150
  }
108
151
  return part;
@@ -110,42 +153,15 @@ class GeorgianHyphenator {
110
153
  }
111
154
  }
112
155
 
113
- /**
114
- * Convert word to TeX pattern format (e.g., .გ1ა1ა1ნ1ა1ლ1ი1ზ1ა.)
115
- * Useful for LaTeX or TeX engines
116
- */
117
- function toTeXPattern(word) {
118
- const hyphenator = new GeorgianHyphenator();
119
- const syllables = hyphenator.getSyllables(word);
120
- if (syllables.length <= 1) {
121
- return `.${word}.`;
122
- }
123
- // TeX hyphenation patterns usually use odd numbers (1, 3, 5) to indicate hyphens
124
- // Here we simply join syllables with '1'
125
- return '.' + syllables.join('1') + '.';
126
- }
127
-
128
- /**
129
- * Convert word to Hunspell format (syllable=syllable)
156
+ /** * კროს-პლატფორმული მხარდაჭერა
130
157
  */
131
- function toHunspellFormat(word) {
132
- const hyphenator = new GeorgianHyphenator();
133
- const syllables = hyphenator.getSyllables(word);
134
- return syllables.join('=');
158
+ // 1. ბრაუზერისთვის (Global Object)
159
+ if (typeof window !== 'undefined') {
160
+ window.GeorgianHyphenator = GeorgianHyphenator;
135
161
  }
136
162
 
137
- // Export for Node.js
163
+ // 2. Node.js (CommonJS) - იმ შემთხვევაში თუ ვინმე მაინც require-ს გამოიყენებს
164
+ // (მხოლოდ თუ module.exports არსებობს)
138
165
  if (typeof module !== 'undefined' && module.exports) {
139
- module.exports = {
140
- GeorgianHyphenator,
141
- toTeXPattern,
142
- toHunspellFormat
143
- };
144
- }
145
-
146
- // Export for Browser
147
- if (typeof window !== 'undefined') {
148
- window.GeorgianHyphenator = GeorgianHyphenator;
149
- window.toTeXPattern = toTeXPattern;
150
- window.toHunspellFormat = toHunspellFormat;
166
+ module.exports = GeorgianHyphenator;
151
167
  }