formosa 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1223 @@
1
+ // LibHolo.h: LibFormosa's Holo Processing Library
2
+ //
3
+ // Copyright (c) 2007 The OpenVanilla Project (http://openvanilla.org)
4
+ // All rights reserved.
5
+ //
6
+ // Redistribution and use in source and binary forms, with or without
7
+ // modification, are permitted provided that the following conditions
8
+ // are met:
9
+ //
10
+ // 1. Redistributions of source code must retain the above copyright
11
+ // notice, this list of conditions and the following disclaimer.
12
+ // 2. Redistributions in binary form must reproduce the above copyright
13
+ // notice, this list of conditions and the following disclaimer in the
14
+ // documentation and/or other materials provided with the distribution.
15
+ // 3. Neither the name of OpenVanilla nor the names of its contributors
16
+ // may be used to endorse or promote products derived from this software
17
+ // without specific prior written permission.
18
+ //
19
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22
+ // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23
+ // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24
+ // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25
+ // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26
+ // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27
+ // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28
+ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29
+ // POSSIBILITY OF SUCH DAMAGE.
30
+
31
+ #include <vector>
32
+ #include <string>
33
+ #include <iostream>
34
+ #include <ctype.h>
35
+
36
+ #include "HoloVowels.h"
37
+
38
+ namespace LibHolo
39
+ {
40
+ using namespace std;
41
+
42
// Abstract interface implemented by every editable text unit in this
// library (ComposableStringBuffer and HoloSyllable in this file).
class Composable
{
public:
    virtual ~Composable() {}

    // Removes all content held by the unit.
    virtual void clear() = 0;

    // Returns true when the unit holds no content.
    virtual bool empty() = 0;

    // Returns the number of editable elements currently held.
    virtual unsigned int numberOfCodepoints() = 0;

    // Returns the string that represents the current content.
    virtual std::string composedForm() = 0;
};

// A Composable backed by a vector of strings, one vector entry per
// codepoint, together with an editing cursor.
//
// The base class is inherited publicly so that this class (and classes
// derived from it) can be used through a Composable pointer or reference,
// just like HoloSyllable.
class ComposableStringBuffer : public Composable
{
public:
    ComposableStringBuffer() : _cursor(0)
    {
    }

    // Returns the current cursor position, counted in codepoints.
    virtual unsigned int cursor()
    {
        return _cursor;
    }

    // Moves the cursor to c when c lies within [0, numberOfCodepoints()];
    // otherwise the cursor is left untouched.  Returns the resulting cursor.
    virtual unsigned int setCursor(unsigned int c)
    {
        if (c <= numberOfCodepoints()) _cursor = c;
        return _cursor;
    }

    virtual void clear()
    {
        _cursor = 0;
        strvec.clear();
    }

    virtual bool empty()
    {
        return strvec.empty();
    }

    virtual unsigned int numberOfCodepoints()
    {
        return static_cast<unsigned int>(strvec.size());
    }

    // Inserts the single character c as a new codepoint before index i.
    // Returns false (and changes nothing) when i is past the end.
    // The cursor is not moved; callers reposition it with setCursor().
    virtual bool insertCharacterAt(unsigned int i, char c)
    {
        if (i > numberOfCodepoints()) return false;
        strvec.insert(strvec.begin() + i, std::string(1, c));
        return true;
    }

    // Removes the codepoint at index i.  Returns false when i is out of
    // range.  If the removal leaves the cursor past the end of the buffer,
    // the cursor is pulled back to the end so that it always stays within
    // the range setCursor() accepts.
    virtual bool removeCodepointAt(unsigned int i)
    {
        if (i >= numberOfCodepoints()) return false;
        strvec.erase(strvec.begin() + i);

        const unsigned int remaining = numberOfCodepoints();
        if (_cursor > remaining) _cursor = remaining;
        return true;
    }

    // For a plain buffer the composed form is the same as the internal form.
    virtual std::string composedForm()
    {
        return internalForm();
    }

    // Returns the concatenation of all stored codepoints, in order.
    virtual std::string internalForm()
    {
        std::string joined;
        for (std::vector<std::string>::const_iterator it = strvec.begin(); it != strvec.end(); ++it)
        {
            joined += *it;
        }
        return joined;
    }

protected:
    std::vector<std::string> strvec;   // one entry per codepoint
    unsigned int _cursor;              // index into strvec, 0..strvec.size()
};
116
+
117
+
118
+ enum SyllableType // tags a HoloSymbol/HoloSyllable with the romanization scheme it is written in
119
+ {
120
+ POJSyllable = 0, // default type set by the HoloSymbol and HoloSyllable default constructors
121
+ TLSyllable = 1, // type produced by convertToTLSyllable() and the FreeFormSyllable converters
122
+ TLPASyllable = 2, // NOTE(review): presumably the TLPA scheme handled by convertToTLFromTLPA() -- confirm
123
+ DTSyllable = 3 // NOTE(review): presumably the DT scheme handled by convertToTLFromDT() -- confirm
124
+ };
125
+
126
+ enum DiacriticInputOption // selects how HoloSyllable::insertCharacterAtCursor() treats a diacritic key
127
+ {
128
+ DiacriticGivenBeforeVowel = 0, // the tone is held as a "prepared tone" and applied to the next inserted character
129
+ DiacriticGivenAfterVowel = 1 // the tone is applied at once to the symbol just before the cursor, if there is one
130
+ };
131
+
132
+ class HoloSymbol
133
+ {
134
+ public:
135
+ HoloSymbol() : _tone(0), _type(POJSyllable)
136
+ {
137
+ }
138
+
139
+ HoloSymbol(const string &s, SyllableType t) : _tone(0), _type(t), _symbol(s)
140
+ {
141
+ }
142
+
143
+ HoloSymbol(const HoloSymbol &s) : _tone(s._tone), _type(s._type), _symbol(s._symbol)
144
+ {
145
+ }
146
+
147
+ void setType(SyllableType t)
148
+ {
149
+ _type = t;
150
+ }
151
+
152
+ const HoloSymbol& operator=(const HoloSymbol &s)
153
+ {
154
+ _symbol = s._symbol;
155
+ _tone = s._tone;
156
+ _type = s._type;
157
+ return *this;
158
+ }
159
+
160
+ string symbol()
161
+ {
162
+ return string(_symbol);
163
+ }
164
+
165
+ string symbolInLowerCase()
166
+ {
167
+ string lower;
168
+ unsigned int s=_symbol.length();
169
+ for (unsigned int i=0; i<s; i++) lower+=tolower(_symbol[i]);
170
+ return lower;
171
+ }
172
+
173
+ string setSymbol(const string& s)
174
+ {
175
+ return (_symbol = s);
176
+ }
177
+
178
+ string composedForm(bool forcePOJStyle=false)
179
+ {
180
+ string composed = ComposeHoloVowel(_symbol, _tone, ((_type==POJSyllable) || forcePOJStyle) ? true : false);
181
+ if (!composed.length()) return _symbol;
182
+ return composed;
183
+ }
184
+
185
+ unsigned int composedLength()
186
+ {
187
+ string composed = composedForm();
188
+ unsigned int len = 0, clen = composed.length();
189
+ for (unsigned int i=0; i<clen; )
190
+ {
191
+ if (!(composed[i] & 0x80)) {
192
+ len++;
193
+ i++;
194
+ }
195
+ else if ((composed[i] & 0xe0) == 0xc0) {
196
+ len++;
197
+ i+=2;
198
+ }
199
+ else if ((composed[i] & 0xf0) == 0xe0) {
200
+ len++;
201
+ i+=3;
202
+ }
203
+ else {
204
+ len++;
205
+ i+=4;
206
+ }
207
+ }
208
+
209
+ // fprintf (stderr, "composed=%s, strlen=%d, calculated len=%d\n", composed.c_str(), clen, len);
210
+
211
+ return len;
212
+ }
213
+
214
+ unsigned int tone()
215
+ {
216
+ return _tone;
217
+ }
218
+
219
+ unsigned int setTone(unsigned int t)
220
+ {
221
+ _tone = t > 9 ? _tone : t;
222
+ return _tone;
223
+ }
224
+
225
+ bool isUpperCase()
226
+ {
227
+ if (!_symbol.length()) return false;
228
+ return toupper(_symbol[0]) == _symbol[0];
229
+ }
230
+
231
+
232
+ protected:
233
+ unsigned int _tone;
234
+ SyllableType _type;
235
+ string _symbol;
236
+ };
237
+
238
+
239
+ // the _cursor in HoloSyllable really means "the internal symbol cursor",
240
+ // as for the display cursor (called by the input method context),
241
+ // we need to recalculate
242
+ class HoloSyllable : public Composable
243
+ {
244
+ public:
245
+ HoloSyllable() : _inputType(POJSyllable), _inputOption(DiacriticGivenBeforeVowel),
246
+ _forcePOJStyle(false),
247
+ _cursor(0), _preparedTone(0)
248
+ {
249
+ }
250
+
251
+ HoloSyllable(const HoloSyllable &s) : _inputType(s._inputType),
252
+ _inputOption(s._inputOption),
253
+ _forcePOJStyle(s._forcePOJStyle),
254
+ _symvec(s._symvec),
255
+ _cursor(s._cursor), _preparedTone(s._preparedTone)
256
+ {
257
+ }
258
+
259
+ const HoloSyllable& operator=(const HoloSyllable &s)
260
+ {
261
+ _inputType = s._inputType;
262
+ _inputOption = s._inputOption;
263
+ _forcePOJStyle = s._forcePOJStyle;
264
+ _symvec = s._symvec;
265
+ _cursor = s._cursor;
266
+ _preparedTone = s._preparedTone;
267
+ return *this;
268
+ }
269
+
270
+ virtual void setInputType(SyllableType t)
271
+ {
272
+ _inputType = t;
273
+ }
274
+
275
+ virtual void setInputOption(DiacriticInputOption o)
276
+ {
277
+ if (o != _inputOption) clearPreparedTone();
278
+ _inputOption = o;
279
+ }
280
+
281
+ virtual void setForcePOJStyle(bool p)
282
+ {
283
+ _forcePOJStyle = p;
284
+ }
285
+
286
+ virtual void clear()
287
+ {
288
+ _symvec.clear();
289
+ _cursor = 0;
290
+ _preparedTone = 0;
291
+ }
292
+
293
+ virtual bool empty()
294
+ {
295
+ return _symvec.empty();
296
+ }
297
+
298
+ virtual unsigned int numberOfCodepoints()
299
+ {
300
+ return _symvec.size();
301
+ }
302
+
303
+ virtual string composedForm()
304
+ {
305
+ unsigned int s = _symvec.size();
306
+ string composed;
307
+ unsigned int i;
308
+
309
+ if (_preparedTone) _cursor--;
310
+
311
+ for (i=0; i<_cursor; i++)
312
+ {
313
+ composed += _symvec[i].composedForm(_forcePOJStyle);
314
+ // fprintf(stderr, "%d, symbol=%s, composed=%s, composd form=%s\n", i, _symvec[i].symbol().c_str(), _symvec[i].composedForm().c_str(), composed.c_str());
315
+ }
316
+
317
+ composed += GetToneASCIIRepresentation(_preparedTone);
318
+ // fprintf(stderr, "composd form=%s\n", composed.c_str());
319
+
320
+ for (; i<s; i++)
321
+ {
322
+ composed += _symvec[i].composedForm(_forcePOJStyle);
323
+ // fprintf(stderr, "composd form=%s\n", composed.c_str());
324
+ }
325
+
326
+ if (_preparedTone) _cursor++;
327
+
328
+ return composed;
329
+ }
330
+
331
+ void setCursor(unsigned int c)
332
+ {
333
+ clearPreparedTone();
334
+ _cursor = c;
335
+ }
336
+
337
+ unsigned int cursor()
338
+ {
339
+ unsigned int realcursor = _preparedTone ? _cursor-1 : _cursor;
340
+ unsigned codepointCursor=0;
341
+ for (unsigned int i=0; i<realcursor; i++) codepointCursor+=_symvec[i].composedLength();
342
+
343
+ if (_preparedTone) codepointCursor++;
344
+
345
+ return codepointCursor;
346
+ }
347
+
348
+ bool cursorHome()
349
+ {
350
+ clearPreparedTone();
351
+ if (_cursor==0) return false;
352
+ _cursor=0;
353
+ return true;
354
+ }
355
+
356
+ bool cursorEnd()
357
+ {
358
+ clearPreparedTone();
359
+ unsigned int len = numberOfCodepoints();
360
+ if (_cursor == len) return false;
361
+ _cursor = len;
362
+ return true;
363
+ }
364
+
365
+ bool cursorLeft()
366
+ {
367
+ clearPreparedTone();
368
+ if (_cursor==0) return false;
369
+ _cursor--;
370
+ return true;
371
+ }
372
+
373
+ bool cursorRight()
374
+ {
375
+ clearPreparedTone();
376
+ if (_cursor == numberOfCodepoints()) return false;
377
+ _cursor++;
378
+ return true;
379
+ }
380
+
381
+ bool insertSymbolAtCursor(const HoloSymbol &s)
382
+ {
383
+ clearPreparedTone();
384
+ HoloSymbol newsym(s);
385
+ newsym.setType(_inputType);
386
+ _symvec.insert(_symvec.begin() + _cursor, newsym);
387
+ _cursor++;
388
+
389
+ return true;
390
+ }
391
+
392
+ // if there is a prepared tone, the given tone parameter will be ignored
393
+ bool insertCharacterAtCursor(char c, unsigned int tone=0)
394
+ {
395
+ // fprintf(stderr, "insert char %d ('%c'), cursor=%d\n", c, c, _cursor);
396
+ if (IsDiacriticSymbol(c))
397
+ {
398
+ unsigned int tone = ToneFromDiacriticSymbol(c);
399
+
400
+ // if there's already a prepared tone, we replace it with the current one
401
+ if (_preparedTone) {
402
+ _preparedTone = tone;
403
+ return true;
404
+ }
405
+
406
+ if (_inputOption==DiacriticGivenBeforeVowel) {
407
+ _preparedTone = tone;
408
+ _cursor++;
409
+ }
410
+ else {
411
+ // diacritic given after vowel
412
+ if (hasPreviousSymbolAtCursor()) previousSymbolAtCursor().setTone(tone);
413
+ }
414
+ return true;
415
+ }
416
+
417
+ // if it's not a diacritic symbol, it's POJ^W^W^W, and it's n or u or g,
418
+ // (and if there's no prepared tone!)
419
+ // we need to do something special...
420
+ if (!IsDiacriticSymbol(c) && !_preparedTone /* && _inputType==POJSyllable */)
421
+ {
422
+ if (hasPreviousSymbolAtCursor())
423
+ {
424
+ string prev = previousSymbolAtCursor().symbolInLowerCase();
425
+
426
+ // N -> nn only works if the first character of the syllable is not an
427
+ // all uppercase symbol
428
+ if (c=='N' && ((prev != "n") && (prev != "nn")) && _inputType == POJSyllable
429
+ && (_symvec.size() > 0 && !_symvec[0].isUpperCase()))
430
+ {
431
+ // insert two n's in a row
432
+ _symvec.insert(_symvec.begin() + _cursor, HoloSymbol(string("nn"), _inputType));
433
+ _cursor++;
434
+ return true;
435
+ }
436
+ else if (tolower(c)=='n' && prev=="n") {
437
+ previousSymbolAtCursor().setSymbol(previousSymbolAtCursor().symbol() + string(1, c));
438
+ return true;
439
+ }
440
+ else if (_inputType == POJSyllable && tolower(c)=='u' && prev=="o") {
441
+ previousSymbolAtCursor().setSymbol(previousSymbolAtCursor().symbol() + string(1, c));
442
+ return true;
443
+ }
444
+ else if (_inputType == TLSyllable && tolower(c)=='o' && prev=="o") {
445
+ previousSymbolAtCursor().setSymbol(previousSymbolAtCursor().symbol() + string(1, c));
446
+ return true;
447
+ }
448
+ else if (tolower(c)=='g' && prev=="nn") {
449
+ // we need to break them up!
450
+ string before = previousSymbolAtCursor().symbol();
451
+
452
+ // and the tone of the previous symbol (when it's combined into nn) will be retained
453
+ previousSymbolAtCursor().setSymbol(before.substr(0, 1));
454
+
455
+ // insert one n and one g
456
+ _symvec.insert(_symvec.begin() + _cursor, HoloSymbol(before.substr(1,1), _inputType));
457
+ _cursor++;
458
+ _symvec.insert(_symvec.begin() + _cursor, HoloSymbol(string(1, c), _inputType));
459
+ _cursor++;
460
+ return true;
461
+ }
462
+ }
463
+ }
464
+
465
+
466
+ HoloSymbol s(string(1, c), _inputType);
467
+ if (_preparedTone)
468
+ {
469
+ _cursor--;
470
+ s.setTone(_preparedTone);
471
+ _preparedTone = 0;
472
+ }
473
+ else if (tone > 1)
474
+ {
475
+ s.setTone(tone);
476
+ }
477
+
478
+ _symvec.insert(_symvec.begin() + _cursor, s);
479
+ _cursor++;
480
+
481
+ return true;
482
+ }
483
+
484
+ bool removeCharacterAtRightOfCursor() // backspace
485
+ {
486
+ if (_preparedTone)
487
+ {
488
+ clearPreparedTone();
489
+ return true;
490
+ }
491
+
492
+ if (atBeginning()) return false;
493
+ _cursor--;
494
+ _symvec.erase(_symvec.begin() + _cursor);
495
+ return true;
496
+ }
497
+
498
+ bool removeCharacterAtLeftOfCursor() // delete
499
+ {
500
+ // we do some tightrope trick here: if we have _preparedTone ready,
501
+ // we "push back" the real _cursor position, do the delete thing,
502
+ // then push it back
503
+ bool retval=true;
504
+ if (_preparedTone) _cursor--;
505
+ if (atEnd()) retval=false; else _symvec.erase(_symvec.begin() + _cursor);
506
+ if (_preparedTone) _cursor++;
507
+ return retval;
508
+ }
509
+
510
+ // returns a normalized string that represents the "internal form" for querying the database
511
+ // implies normalization
512
+ string normalizedQueryData(unsigned int finalTone=0)
513
+ {
514
+ HoloSyllable s(*this);
515
+ s.normalize(finalTone);
516
+ string query;
517
+
518
+ unsigned int size=s._symvec.size();
519
+ unsigned int loudest = 0;
520
+
521
+ for (unsigned int i=0; i<size; i++) {
522
+ query = query + s._symvec[i].symbol();
523
+ if (s._symvec[i].tone() > 1) loudest = s._symvec[i].tone();
524
+ // fprintf (stderr, "combining query data %s, tone %d\n", s._symvec[i].symbol().c_str(), s._symvec[i].tone());
525
+ }
526
+
527
+ if (loudest > 1) query = query + string(1, loudest+'0');
528
+ return query;
529
+ }
530
+
531
+ // normalization is an "identpotent" function, ie. the result should
532
+ // be the same no matter how many times you call it--this being a very
533
+ // important linguistic characteristic of this function
534
+ void normalize(unsigned int finalTone=0)
535
+ {
536
+ // fprintf (stderr, "input finalTone=%d\n", finalTone);
537
+ unsigned int end = _symvec.size();
538
+
539
+ // if it's empty, just return
540
+ if (!end) return;
541
+
542
+ unsigned int loudestVowel = end;
543
+ unsigned int loudestTone = 0;
544
+ unsigned int p;
545
+
546
+ // find the loudest vowel
547
+ #define FLV(x) ((p=findSymbol(x)) != end)
548
+ #define SETLOUDEST(v) do { loudestVowel = v; if (_symvec[loudestVowel].tone()>1) { loudestTone = _symvec[loudestVowel].tone(); } } while(0)
549
+
550
+
551
+ if (end==1 && _symvec[0].symbolInLowerCase()=="m") SETLOUDEST(0);
552
+ if (FLV("n")) SETLOUDEST(p);
553
+ if (FLV("m")) SETLOUDEST(p);
554
+
555
+ // see if it's ng
556
+ if ((p=findSymbolPair("n", "g")) != end)
557
+ SETLOUDEST(p);
558
+
559
+ if (FLV("u")) SETLOUDEST(p);
560
+ if (FLV("i")) SETLOUDEST(p);
561
+ if (FLV("o")) SETLOUDEST(p);
562
+ if (FLV("e")) SETLOUDEST(p);
563
+ if (FLV("ou")) SETLOUDEST(p);
564
+ if (FLV("oo")) SETLOUDEST(p);
565
+ if (FLV("a")) SETLOUDEST(p);
566
+
567
+ // the last "ere" override
568
+ if (end >= 3) {
569
+ if (_symvec[end-1].symbolInLowerCase() == "e" && _symvec[end-2].symbolInLowerCase() == "r" && _symvec[end-3].symbolInLowerCase() == "e")
570
+ {
571
+ SETLOUDEST(end-1);
572
+ }
573
+ }
574
+
575
+ if (loudestVowel==end) return;
576
+ // fprintf(stderr, "found loudest vowel=%d (%s), loudest tone=%d\n", loudestVowel, _symvec[loudestVowel].symbol().c_str(), loudestTone);
577
+
578
+ // finalTone overrides
579
+ if (finalTone > 1) loudestTone = finalTone;
580
+
581
+ for (unsigned int i=0; i<end; i++) _symvec[i].setTone(0);
582
+
583
+ string lastSymbolStr = _symvec[end-1].symbolInLowerCase();
584
+
585
+ // if the symbol is "i", and there's a next "u", we shift
586
+ // the vowel to "u"
587
+
588
+ if (_symvec[loudestVowel].symbolInLowerCase()=="i")
589
+ {
590
+ if (loudestVowel+1 < end)
591
+ {
592
+ if (_symvec[loudestVowel+1].symbolInLowerCase()=="u") loudestVowel++;
593
+ }
594
+ }
595
+
596
+ if (loudestTone==4 || /* loudestTone==6 || */ loudestTone <= 1) {
597
+ // ignore the 4th, 6th and 1th (or no tone), so everything is set to 0 now
598
+ return;
599
+ }
600
+
601
+ if (lastSymbolStr=="t" || lastSymbolStr=="p" || lastSymbolStr=="k" || lastSymbolStr=="h") {
602
+ // only when the ending is t, p, k, h is the tone set -- and only when the tone is 8
603
+ if (loudestTone==8) _symvec[loudestVowel].setTone(loudestTone);
604
+ return;
605
+ }
606
+ else {
607
+ // if not t,p,k,h, we need to override the loudest tone--back to tone 1 !
608
+ if (loudestTone==8) {
609
+ _symvec[loudestVowel].setTone(0);
610
+ return;
611
+ }
612
+ }
613
+
614
+ _symvec[loudestVowel].setTone(loudestTone);
615
+
616
+ #undef FLV
617
+ #undef SETTONE
618
+ }
619
+
620
+ HoloSyllable convertToPOJSyllable()
621
+ {
622
+ HoloSyllable syl = *this;
623
+ syl.clearPreparedTone();
624
+ syl.setCursor(0);
625
+ if (_inputType==POJSyllable) return syl;
626
+
627
+ syl.setInputType(POJSyllable);
628
+ syl.clear();
629
+
630
+ // begin TL->POJ conversion
631
+ unsigned int size=_symvec.size();
632
+ unsigned int i;
633
+
634
+ for (i=0; i<size; i++)
635
+ {
636
+ HoloSymbol sym1 = _symvec[i];
637
+ string str1 = sym1.symbol();
638
+
639
+ // fprintf (stderr, "converting to POJ: %s\n", str1.c_str());
640
+
641
+ string lowstr1 = sym1.symbolInLowerCase();
642
+
643
+ // oo -> ou
644
+ if (lowstr1=="oo")
645
+ {
646
+ // detect case
647
+ if (str1[0] == tolower(str1[0])) {
648
+ syl.insertCharacterAtCursor('o', sym1.tone());
649
+ syl.insertCharacterAtCursor('u');
650
+ }
651
+ else
652
+ {
653
+ syl.insertCharacterAtCursor('O', sym1.tone());
654
+ syl.insertCharacterAtCursor('U');
655
+ }
656
+ continue;
657
+ }
658
+
659
+
660
+ if (hasNextSymbol(i)) {
661
+ HoloSymbol sym2 = _symvec[i+1];
662
+ string str2 = sym2.symbol();
663
+ string lowstr2 = sym2.symbolInLowerCase();
664
+
665
+ // ts -> ch with case detection
666
+ if (lowstr1=="t" && lowstr2=="s") {
667
+ // detect case
668
+ if (str1[0] == tolower(str1[0])) {
669
+ syl.insertCharacterAtCursor('c');
670
+ syl.insertCharacterAtCursor('h');
671
+ }
672
+ else {
673
+ syl.insertCharacterAtCursor('C');
674
+ syl.insertCharacterAtCursor('H');
675
+ }
676
+
677
+ i++;
678
+ continue;
679
+ }
680
+
681
+ // ue -> oe
682
+ if (lowstr1=="u" && lowstr2=="e") {
683
+ // detect case
684
+ if (str1[0] == tolower(str1[0])) {
685
+ syl.insertCharacterAtCursor('o', sym1.tone());
686
+ syl.insertCharacterAtCursor('e', sym2.tone());
687
+ }
688
+ else {
689
+ syl.insertCharacterAtCursor('O', sym1.tone());
690
+ syl.insertCharacterAtCursor('E', sym2.tone());
691
+ }
692
+
693
+ i++;
694
+ continue;
695
+ }
696
+
697
+ // ua -> oa
698
+ if (lowstr1=="u" && lowstr2=="a") {
699
+ // detect case
700
+ if (str1[0] == tolower(str1[0])) {
701
+ syl.insertCharacterAtCursor('o', sym1.tone());
702
+ syl.insertCharacterAtCursor('a', sym2.tone());
703
+ }
704
+ else {
705
+ syl.insertCharacterAtCursor('O', sym1.tone());
706
+ syl.insertCharacterAtCursor('A', sym2.tone());
707
+ }
708
+
709
+ i++;
710
+ continue;
711
+ }
712
+
713
+ // ik -> ek (at ending)
714
+ if (lowstr1=="i" && lowstr2=="k" && (i+2)==size) {
715
+ // detect case
716
+ if (str1[0] == tolower(str1[0])) {
717
+ syl.insertCharacterAtCursor('e', sym1.tone());
718
+ syl.insertCharacterAtCursor('k', sym2.tone());
719
+ }
720
+ else {
721
+ syl.insertCharacterAtCursor('E', sym1.tone());
722
+ syl.insertCharacterAtCursor('K', sym2.tone());
723
+ }
724
+
725
+ i++;
726
+ continue;
727
+ }
728
+
729
+ if (hasNextNextSymbol(i) && (i+3)==size) {
730
+ HoloSymbol sym3 = _symvec[i+2];
731
+ string str3 = sym3.symbol();
732
+ string lowstr3 = sym3.symbolInLowerCase();
733
+
734
+ // ing -> eng (must be ending)
735
+ if (lowstr1=="i" && lowstr2=="n" && lowstr3=="g") {
736
+ // detect case
737
+ if (str1[0] == tolower(str1[0])) {
738
+ syl.insertCharacterAtCursor('e', sym1.tone());
739
+ syl.insertCharacterAtCursor('n', sym2.tone());
740
+ syl.insertCharacterAtCursor('g', sym3.tone());
741
+ }
742
+ else {
743
+ syl.insertCharacterAtCursor('E', sym1.tone());
744
+ syl.insertCharacterAtCursor('N', sym2.tone());
745
+ syl.insertCharacterAtCursor('G', sym3.tone());
746
+ }
747
+
748
+ i+=2;
749
+ continue;
750
+ }
751
+
752
+ // ouh -> oh (ending)
753
+ if (lowstr1=="o" && lowstr2=="u" && lowstr3=="h") {
754
+ // detect case
755
+ if (str2[0] == tolower(str2[0])) {
756
+ syl.insertCharacterAtCursor('o', sym1.tone());
757
+ syl.insertCharacterAtCursor('h', sym2.tone());
758
+ }
759
+ else {
760
+ syl.insertCharacterAtCursor('O', sym1.tone());
761
+ syl.insertCharacterAtCursor('H', sym2.tone());
762
+ }
763
+
764
+ i+=2;
765
+ continue;
766
+ }
767
+ }
768
+ }
769
+
770
+ syl.insertSymbolAtCursor(sym1);
771
+ }
772
+
773
+ return syl;
774
+ }
775
+
776
+ HoloSyllable convertToTLSyllable()
777
+ {
778
+ HoloSyllable syl = *this;
779
+ syl.clearPreparedTone();
780
+ syl.setCursor(0);
781
+ if (_inputType==TLSyllable) return syl;
782
+
783
+ syl.setInputType(TLSyllable);
784
+ syl.clear();
785
+
786
+ // begin POJ->TL conversion
787
+ unsigned int size=_symvec.size();
788
+ unsigned int i;
789
+
790
+ for (i=0; i<size; i++)
791
+ {
792
+ HoloSymbol sym1 = _symvec[i];
793
+ string str1 = sym1.symbol();
794
+ string lowstr1 = sym1.symbolInLowerCase();
795
+
796
+ // ou -> oo
797
+ if (lowstr1=="ou")
798
+ {
799
+ // detect case
800
+ if (str1[0] == tolower(str1[0])) {
801
+ syl.insertCharacterAtCursor('o', sym1.tone());
802
+ syl.insertCharacterAtCursor('o');
803
+ }
804
+ else
805
+ {
806
+ syl.insertCharacterAtCursor('O', sym1.tone());
807
+ syl.insertCharacterAtCursor('O');
808
+ }
809
+ continue;
810
+ }
811
+
812
+
813
+ if (hasNextSymbol(i)) {
814
+ HoloSymbol sym2 = _symvec[i+1];
815
+ string str2 = sym2.symbol();
816
+ string lowstr2 = sym2.symbolInLowerCase();
817
+
818
+ // ch -> ts with case detection
819
+ if (lowstr1=="c" && lowstr2=="h") {
820
+ // detect case
821
+ if (str1[0] == tolower(str1[0])) {
822
+ syl.insertCharacterAtCursor('t');
823
+ syl.insertCharacterAtCursor('s');
824
+ }
825
+ else {
826
+ syl.insertCharacterAtCursor('T');
827
+ syl.insertCharacterAtCursor('S');
828
+ }
829
+
830
+ i++;
831
+ continue;
832
+ }
833
+
834
+ // oe -> ue
835
+ if (lowstr1=="o" && lowstr2=="e") {
836
+ // detect case
837
+ if (str1[0] == tolower(str1[0])) {
838
+ syl.insertCharacterAtCursor('u', sym1.tone());
839
+ syl.insertCharacterAtCursor('e', sym2.tone());
840
+ }
841
+ else {
842
+ syl.insertCharacterAtCursor('U', sym1.tone());
843
+ syl.insertCharacterAtCursor('E', sym2.tone());
844
+ }
845
+
846
+ i++;
847
+ continue;
848
+ }
849
+
850
+ // oa -> ua
851
+ if (lowstr1=="o" && lowstr2=="a") {
852
+ // detect case
853
+ if (str1[0] == tolower(str1[0])) {
854
+ syl.insertCharacterAtCursor('u', sym1.tone());
855
+ syl.insertCharacterAtCursor('a', sym2.tone());
856
+ }
857
+ else {
858
+ syl.insertCharacterAtCursor('U', sym1.tone());
859
+ syl.insertCharacterAtCursor('A', sym2.tone());
860
+ }
861
+
862
+ i++;
863
+ continue;
864
+ }
865
+
866
+ // ek -> ik (at ending)
867
+ if (lowstr1=="e" && lowstr2=="k" && (i+2)==size) {
868
+ // detect case
869
+ if (str1[0] == tolower(str1[0])) {
870
+ syl.insertCharacterAtCursor('i', sym1.tone());
871
+ syl.insertCharacterAtCursor('k', sym2.tone());
872
+ }
873
+ else {
874
+ syl.insertCharacterAtCursor('I', sym1.tone());
875
+ syl.insertCharacterAtCursor('K', sym2.tone());
876
+ }
877
+
878
+ i++;
879
+ continue;
880
+ }
881
+
882
+ if (hasNextNextSymbol(i) && (i+3)==size) {
883
+ HoloSymbol sym3 = _symvec[i+2];
884
+ string str3 = sym3.symbol();
885
+ string lowstr3 = sym3.symbolInLowerCase();
886
+
887
+ // ing -> eng (must be ending)
888
+ if (lowstr1=="e" && lowstr2=="n" && lowstr3=="g") {
889
+ // detect case
890
+ if (str1[0] == tolower(str1[0])) {
891
+ syl.insertCharacterAtCursor('i', sym1.tone());
892
+ syl.insertCharacterAtCursor('n', sym2.tone());
893
+ syl.insertCharacterAtCursor('g', sym3.tone());
894
+ }
895
+ else {
896
+ syl.insertCharacterAtCursor('I', sym1.tone());
897
+ syl.insertCharacterAtCursor('N', sym2.tone());
898
+ syl.insertCharacterAtCursor('G', sym3.tone());
899
+ }
900
+
901
+ i+=2;
902
+ continue;
903
+ }
904
+ }
905
+ }
906
+
907
+ syl.insertSymbolAtCursor(sym1);
908
+ }
909
+ return syl;
910
+ }
911
+
912
+
913
+ protected:
914
+ bool atBeginning()
915
+ {
916
+ return _cursor == 0;
917
+ }
918
+
919
+ bool atEnd()
920
+ {
921
+ return _cursor == numberOfCodepoints();
922
+ }
923
+
924
+ void clearPreparedTone()
925
+ {
926
+ if (!_preparedTone) return;
927
+ _preparedTone = 0;
928
+ _cursor--;
929
+ }
930
+
931
+ bool hasPreviousSymbolAtCursor()
932
+ {
933
+ unsigned int realcursor = _preparedTone ? _cursor-1 : _cursor;
934
+ return realcursor > 0;
935
+ }
936
+
937
+ bool hasNextSymbol(unsigned int pos)
938
+ {
939
+ if (pos+1 >= _symvec.size()) return false;
940
+ return true;
941
+ }
942
+
943
+ bool hasNextNextSymbol(unsigned int pos)
944
+ {
945
+ if (pos+2 >= _symvec.size()) return false;
946
+ return true;
947
+ }
948
+
949
+
950
+ // the result of this function is unpredictable if there's no
951
+ // previous symbol--always check with hasPreviousSymbolAtCursor() !
952
+ HoloSymbol& previousSymbolAtCursor()
953
+ {
954
+ unsigned int realcursor = _preparedTone ? _cursor-1 : _cursor;
955
+ return _symvec[realcursor-1];
956
+ }
957
+
958
+ // always assumes that the given input is in all lower case
959
+ unsigned int findSymbol(const char *s)
960
+ {
961
+ string cpps(s);
962
+ unsigned int size = _symvec.size();
963
+ unsigned int i;
964
+ for (i = 0; i < size; i++) {
965
+ if (_symvec[i].symbolInLowerCase() == cpps) break;
966
+ }
967
+ return i;
968
+ }
969
+
970
+ unsigned int findSymbolPair(const char *s1, const char *s2)
971
+ {
972
+ string cpps1(s1), cpps2(s2);
973
+
974
+ unsigned int size = _symvec.size();
975
+ if (size < 2) return size;
976
+
977
+ unsigned int i;
978
+ for (i = 0; i < size-1; i++) {
979
+ if (_symvec[i].symbolInLowerCase()==cpps1 && _symvec[i+1].symbolInLowerCase()==cpps2) return i;
980
+ }
981
+
982
+ return size;
983
+ }
984
+
985
+ SyllableType _inputType;
986
+ DiacriticInputOption _inputOption;
987
+ bool _forcePOJStyle;
988
+
989
+ vector<HoloSymbol> _symvec;
990
+ unsigned int _cursor;
991
+ unsigned int _preparedTone;
992
+ };
993
+
994
+ class FreeFormSyllable : public ComposableStringBuffer
995
+ {
996
+ public:
997
+ HoloSyllable convertToTLFromTLPA(unsigned int finalTone=0)
998
+ {
999
+ string rep=internalForm();
1000
+ HoloSyllable syl;
1001
+ syl.setInputType(TLSyllable);
1002
+
1003
+ unsigned int size=rep.length();
1004
+ for (unsigned int i=0; i<size; i++)
1005
+ {
1006
+ if (rep[i]=='c') {
1007
+ syl.insertCharacterAtCursor('t');
1008
+ syl.insertCharacterAtCursor('s');
1009
+ }
1010
+ else if (rep[i]=='C')
1011
+ {
1012
+ syl.insertCharacterAtCursor('T');
1013
+ syl.insertCharacterAtCursor('S');
1014
+ }
1015
+ else syl.insertCharacterAtCursor(rep[i]);
1016
+ }
1017
+
1018
+ syl.normalize(finalTone);
1019
+ return syl;
1020
+ }
1021
+
1022
+
1023
+ HoloSyllable convertToTLFromDT(unsigned int finalTone=0)
1024
+ {
1025
+ string rep=internalForm();
1026
+ HoloSyllable syl;
1027
+ syl.setInputType(TLSyllable);
1028
+
1029
+ unsigned int size=rep.length();
1030
+ for (unsigned int i=0; i<size; i++)
1031
+ {
1032
+ char dt1 = rep[i];
1033
+ char lowdt1 = tolower(dt1);
1034
+
1035
+ // r -> j (beginning)
1036
+ if (i==0 && lowdt1=='r') {
1037
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('j', dt1));
1038
+ continue;
1039
+ }
1040
+
1041
+ // replaces the two-character combinations
1042
+ if (i+1 < size) {
1043
+ string part=rep.substr(i, 2);
1044
+ string lower=toLowerString(part);
1045
+
1046
+ // or -> o
1047
+ if (lower=="or") {
1048
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('o', part));
1049
+ i++;
1050
+ continue;
1051
+ }
1052
+
1053
+ // en -> ian
1054
+ if (lower=="en") {
1055
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
1056
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('a', part));
1057
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('n', part));
1058
+ i++;
1059
+ continue;
1060
+ }
1061
+
1062
+ // et -> iat
1063
+ if (lower=="et") {
1064
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
1065
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('a', part));
1066
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', part));
1067
+ i++;
1068
+ continue;
1069
+ }
1070
+
1071
+
1072
+ // bh -> b (beginning)
1073
+ if (i==0 && lower=="bh") {
1074
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('b', part));
1075
+ i++;
1076
+ continue;
1077
+ }
1078
+
1079
+ // gh -> g (beginning)
1080
+ if (i==0 && lower=="gh") {
1081
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('g', part));
1082
+ i++;
1083
+ continue;
1084
+ }
1085
+
1086
+ // wa -> ua (beginning)
1087
+ if (lower=="wa" && i==0) {
1088
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('u', part));
1089
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('a', part));
1090
+ i++;
1091
+ continue;
1092
+ }
1093
+
1094
+ // we -> ue (beginning)
1095
+ if (lower=="we" && i==0) {
1096
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('u', part));
1097
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('e', part));
1098
+ i++;
1099
+ continue;
1100
+ }
1101
+
1102
+ // wi -> ui (beginning)
1103
+ if (lower=="wi" && i==0) {
1104
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('u', part));
1105
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
1106
+ i++;
1107
+ continue;
1108
+ }
1109
+
1110
+ // yo -> io (beginning)
1111
+ if (lower=="yo" && i==0) {
1112
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
1113
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('o', part));
1114
+ i++;
1115
+ continue;
1116
+ }
1117
+
1118
+ // yi -> i (beginning)
1119
+ if (lower=="yi" && i==0) {
1120
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
1121
+ i++;
1122
+ continue;
1123
+ }
1124
+ }
1125
+
1126
+ // o -> oo
1127
+ if (lowdt1=='o') {
1128
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('o', dt1));
1129
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('o', dt1));
1130
+ continue;
1131
+ }
1132
+
1133
+ // b -> p (beginning)
1134
+ if (i==0 && lowdt1=='b') {
1135
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('p', dt1));
1136
+ continue;
1137
+ }
1138
+
1139
+ // p -> ph (beginning)
1140
+ if (i==0 && lowdt1=='p') {
1141
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('p', dt1));
1142
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('h', dt1));
1143
+ continue;
1144
+ }
1145
+
1146
+ // k -> kh (beginning)
1147
+ if (i==0 && lowdt1=='k') {
1148
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('k', dt1));
1149
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('h', dt1));
1150
+ continue;
1151
+ }
1152
+
1153
+ // g -> k (beginning)
1154
+ if (i==0 && lowdt1=='g') {
1155
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('k', dt1));
1156
+ continue;
1157
+ }
1158
+
1159
+ // d -> t (beginning)
1160
+ if (i==0 && lowdt1=='d') {
1161
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', dt1));
1162
+ continue;
1163
+ }
1164
+
1165
+ // t -> th (beginning)
1166
+ if (i==0 && lowdt1=='t') {
1167
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', dt1));
1168
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('h', dt1));
1169
+ continue;
1170
+ }
1171
+
1172
+ // z -> ts (beginning)
1173
+ if (i==0 && lowdt1=='z') {
1174
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', dt1));
1175
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('s', dt1));
1176
+ continue;
1177
+ }
1178
+
1179
+ // c -> tsh (beginning)
1180
+ if (i==0 && lowdt1=='c') {
1181
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', dt1));
1182
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('s', dt1));
1183
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('h', dt1));
1184
+ continue;
1185
+ }
1186
+
1187
+ // else ...
1188
+ syl.insertCharacterAtCursor(dt1);
1189
+ }
1190
+
1191
+ // remap the final tone
1192
+ unsigned int tltone=finalTone;
1193
+
1194
+ syl.normalize(tltone);
1195
+ return syl;
1196
+
1197
+ }
1198
+
1199
+ protected:
1200
+ char charWithCaseAccordingTo(char c, char ref)
1201
+ {
1202
+ if (tolower(ref) == ref) return tolower(c);
1203
+ return toupper(c);
1204
+ }
1205
+
1206
+ char charWithCaseAccordingTo(char c, const string &r)
1207
+ {
1208
+ if (tolower(r[0]) == r[0]) return tolower(c);
1209
+ return toupper(c);
1210
+ }
1211
+
1212
+ string toLowerString(const string &s)
1213
+ {
1214
+ unsigned int size=s.length();
1215
+ string lower;
1216
+ unsigned int i;
1217
+ for (i=0;i<size;i++) lower+=string(1, tolower(s[i]));
1218
+ return lower;
1219
+ }
1220
+ };
1221
+ };
1222
+
1223
+