formosa 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1223 @@
1
+ // LibHolo.h: LibFormosa's Holo Processing Library
2
+ //
3
+ // Copyright (c) 2007 The OpenVanilla Project (http://openvanilla.org)
4
+ // All rights reserved.
5
+ //
6
+ // Redistribution and use in source and binary forms, with or without
7
+ // modification, are permitted provided that the following conditions
8
+ // are met:
9
+ //
10
+ // 1. Redistributions of source code must retain the above copyright
11
+ // notice, this list of conditions and the following disclaimer.
12
+ // 2. Redistributions in binary form must reproduce the above copyright
13
+ // notice, this list of conditions and the following disclaimer in the
14
+ // documentation and/or other materials provided with the distribution.
15
+ // 3. Neither the name of OpenVanilla nor the names of its contributors
16
+ // may be used to endorse or promote products derived from this software
17
+ // without specific prior written permission.
18
+ //
19
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22
+ // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23
+ // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24
+ // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25
+ // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26
+ // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27
+ // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28
+ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29
+ // POSSIBILITY OF SUCH DAMAGE.
30
+
31
+ #include <vector>
32
+ #include <string>
33
+ #include <iostream>
34
+ #include <ctype.h>
35
+
36
+ #include "HoloVowels.h"
37
+
38
+ namespace LibHolo
39
+ {
40
+ using namespace std;
41
+
42
// Abstract interface for anything that can be rendered as a composed string:
// it can be reset, tested for emptiness, measured, and turned into its
// output form.
class Composable
{
public:
    virtual ~Composable() {}

    // Discards all content held by the object.
    virtual void clear() = 0;

    // Reports whether the object currently holds no content.
    virtual bool empty() = 0;

    // Number of units ("codepoints") currently held.
    virtual unsigned int numberOfCodepoints() = 0;

    // The string this object composes to.
    virtual std::string composedForm() = 0;
};
51
+
52
+ class ComposableStringBuffer : Composable {
53
+ public:
54
+ ComposableStringBuffer() : _cursor(0)
55
+ {
56
+ }
57
+
58
+ virtual unsigned int cursor()
59
+ {
60
+ return _cursor;
61
+ }
62
+
63
+ virtual unsigned int setCursor(unsigned int c)
64
+ {
65
+ if (c <= numberOfCodepoints()) _cursor = c;
66
+ return _cursor;
67
+ }
68
+
69
+ virtual void clear()
70
+ {
71
+ _cursor = 0;
72
+ strvec.clear();
73
+ }
74
+
75
+ virtual bool empty()
76
+ {
77
+ return strvec.empty();
78
+ }
79
+
80
+ virtual unsigned int numberOfCodepoints()
81
+ {
82
+ return strvec.size();
83
+ }
84
+
85
+ virtual bool insertCharacterAt(unsigned int i, char c)
86
+ {
87
+ if (i > numberOfCodepoints()) return false;
88
+ strvec.insert(strvec.begin()+i, string(1, c));
89
+ return true;
90
+ }
91
+
92
+ virtual bool removeCodepointAt(unsigned int i)
93
+ {
94
+ if (i >= numberOfCodepoints()) return false;
95
+ strvec.erase(strvec.begin() + i);
96
+ return true;
97
+ }
98
+
99
+ virtual string composedForm()
100
+ {
101
+ return internalForm();
102
+ }
103
+
104
+ virtual string internalForm()
105
+ {
106
+ string newstr;
107
+ unsigned int s=numberOfCodepoints();
108
+ for (unsigned int i=0; i<s; i++) newstr+=strvec[i];
109
+ return newstr;
110
+ }
111
+
112
+ protected:
113
+ vector<string> strvec;
114
+ unsigned int _cursor;
115
+ };
116
+
117
+
118
// Identifies the romanization scheme a syllable or symbol is expressed in.
// The numeric values are spelled out explicitly and must stay stable.
enum SyllableType
{
    POJSyllable  = 0,   // POJ
    TLSyllable   = 1,   // TL
    TLPASyllable = 2,   // TLPA
    DTSyllable   = 3    // DT
};
125
+
126
// Selects whether the user types a tone diacritic before the vowel it
// applies to, or after it.
enum DiacriticInputOption
{
    DiacriticGivenBeforeVowel = 0,   // the tone is held until the next character arrives
    DiacriticGivenAfterVowel  = 1    // the tone is applied to the symbol before the cursor
};
131
+
132
+ class HoloSymbol
133
+ {
134
+ public:
135
+ HoloSymbol() : _tone(0), _type(POJSyllable)
136
+ {
137
+ }
138
+
139
+ HoloSymbol(const string &s, SyllableType t) : _tone(0), _type(t), _symbol(s)
140
+ {
141
+ }
142
+
143
+ HoloSymbol(const HoloSymbol &s) : _tone(s._tone), _type(s._type), _symbol(s._symbol)
144
+ {
145
+ }
146
+
147
+ void setType(SyllableType t)
148
+ {
149
+ _type = t;
150
+ }
151
+
152
+ const HoloSymbol& operator=(const HoloSymbol &s)
153
+ {
154
+ _symbol = s._symbol;
155
+ _tone = s._tone;
156
+ _type = s._type;
157
+ return *this;
158
+ }
159
+
160
+ string symbol()
161
+ {
162
+ return string(_symbol);
163
+ }
164
+
165
+ string symbolInLowerCase()
166
+ {
167
+ string lower;
168
+ unsigned int s=_symbol.length();
169
+ for (unsigned int i=0; i<s; i++) lower+=tolower(_symbol[i]);
170
+ return lower;
171
+ }
172
+
173
+ string setSymbol(const string& s)
174
+ {
175
+ return (_symbol = s);
176
+ }
177
+
178
+ string composedForm(bool forcePOJStyle=false)
179
+ {
180
+ string composed = ComposeHoloVowel(_symbol, _tone, ((_type==POJSyllable) || forcePOJStyle) ? true : false);
181
+ if (!composed.length()) return _symbol;
182
+ return composed;
183
+ }
184
+
185
+ unsigned int composedLength()
186
+ {
187
+ string composed = composedForm();
188
+ unsigned int len = 0, clen = composed.length();
189
+ for (unsigned int i=0; i<clen; )
190
+ {
191
+ if (!(composed[i] & 0x80)) {
192
+ len++;
193
+ i++;
194
+ }
195
+ else if ((composed[i] & 0xe0) == 0xc0) {
196
+ len++;
197
+ i+=2;
198
+ }
199
+ else if ((composed[i] & 0xf0) == 0xe0) {
200
+ len++;
201
+ i+=3;
202
+ }
203
+ else {
204
+ len++;
205
+ i+=4;
206
+ }
207
+ }
208
+
209
+ // fprintf (stderr, "composed=%s, strlen=%d, calculated len=%d\n", composed.c_str(), clen, len);
210
+
211
+ return len;
212
+ }
213
+
214
+ unsigned int tone()
215
+ {
216
+ return _tone;
217
+ }
218
+
219
+ unsigned int setTone(unsigned int t)
220
+ {
221
+ _tone = t > 9 ? _tone : t;
222
+ return _tone;
223
+ }
224
+
225
+ bool isUpperCase()
226
+ {
227
+ if (!_symbol.length()) return false;
228
+ return toupper(_symbol[0]) == _symbol[0];
229
+ }
230
+
231
+
232
+ protected:
233
+ unsigned int _tone;
234
+ SyllableType _type;
235
+ string _symbol;
236
+ };
237
+
238
+
239
+ // the _cursor in HoloSyllable really means "the internal symbol cursor",
240
+ // as for the display cursor (called by the input method context),
241
+ // we need to recalculate
242
+ class HoloSyllable : public Composable
243
+ {
244
+ public:
245
+ HoloSyllable() : _inputType(POJSyllable), _inputOption(DiacriticGivenBeforeVowel),
246
+ _forcePOJStyle(false),
247
+ _cursor(0), _preparedTone(0)
248
+ {
249
+ }
250
+
251
+ HoloSyllable(const HoloSyllable &s) : _inputType(s._inputType),
252
+ _inputOption(s._inputOption),
253
+ _forcePOJStyle(s._forcePOJStyle),
254
+ _symvec(s._symvec),
255
+ _cursor(s._cursor), _preparedTone(s._preparedTone)
256
+ {
257
+ }
258
+
259
+ const HoloSyllable& operator=(const HoloSyllable &s)
260
+ {
261
+ _inputType = s._inputType;
262
+ _inputOption = s._inputOption;
263
+ _forcePOJStyle = s._forcePOJStyle;
264
+ _symvec = s._symvec;
265
+ _cursor = s._cursor;
266
+ _preparedTone = s._preparedTone;
267
+ return *this;
268
+ }
269
+
270
+ virtual void setInputType(SyllableType t)
271
+ {
272
+ _inputType = t;
273
+ }
274
+
275
+ virtual void setInputOption(DiacriticInputOption o)
276
+ {
277
+ if (o != _inputOption) clearPreparedTone();
278
+ _inputOption = o;
279
+ }
280
+
281
+ virtual void setForcePOJStyle(bool p)
282
+ {
283
+ _forcePOJStyle = p;
284
+ }
285
+
286
+ virtual void clear()
287
+ {
288
+ _symvec.clear();
289
+ _cursor = 0;
290
+ _preparedTone = 0;
291
+ }
292
+
293
+ virtual bool empty()
294
+ {
295
+ return _symvec.empty();
296
+ }
297
+
298
+ virtual unsigned int numberOfCodepoints()
299
+ {
300
+ return _symvec.size();
301
+ }
302
+
303
+ virtual string composedForm()
304
+ {
305
+ unsigned int s = _symvec.size();
306
+ string composed;
307
+ unsigned int i;
308
+
309
+ if (_preparedTone) _cursor--;
310
+
311
+ for (i=0; i<_cursor; i++)
312
+ {
313
+ composed += _symvec[i].composedForm(_forcePOJStyle);
314
+ // fprintf(stderr, "%d, symbol=%s, composed=%s, composd form=%s\n", i, _symvec[i].symbol().c_str(), _symvec[i].composedForm().c_str(), composed.c_str());
315
+ }
316
+
317
+ composed += GetToneASCIIRepresentation(_preparedTone);
318
+ // fprintf(stderr, "composd form=%s\n", composed.c_str());
319
+
320
+ for (; i<s; i++)
321
+ {
322
+ composed += _symvec[i].composedForm(_forcePOJStyle);
323
+ // fprintf(stderr, "composd form=%s\n", composed.c_str());
324
+ }
325
+
326
+ if (_preparedTone) _cursor++;
327
+
328
+ return composed;
329
+ }
330
+
331
+ void setCursor(unsigned int c)
332
+ {
333
+ clearPreparedTone();
334
+ _cursor = c;
335
+ }
336
+
337
+ unsigned int cursor()
338
+ {
339
+ unsigned int realcursor = _preparedTone ? _cursor-1 : _cursor;
340
+ unsigned codepointCursor=0;
341
+ for (unsigned int i=0; i<realcursor; i++) codepointCursor+=_symvec[i].composedLength();
342
+
343
+ if (_preparedTone) codepointCursor++;
344
+
345
+ return codepointCursor;
346
+ }
347
+
348
+ bool cursorHome()
349
+ {
350
+ clearPreparedTone();
351
+ if (_cursor==0) return false;
352
+ _cursor=0;
353
+ return true;
354
+ }
355
+
356
+ bool cursorEnd()
357
+ {
358
+ clearPreparedTone();
359
+ unsigned int len = numberOfCodepoints();
360
+ if (_cursor == len) return false;
361
+ _cursor = len;
362
+ return true;
363
+ }
364
+
365
+ bool cursorLeft()
366
+ {
367
+ clearPreparedTone();
368
+ if (_cursor==0) return false;
369
+ _cursor--;
370
+ return true;
371
+ }
372
+
373
+ bool cursorRight()
374
+ {
375
+ clearPreparedTone();
376
+ if (_cursor == numberOfCodepoints()) return false;
377
+ _cursor++;
378
+ return true;
379
+ }
380
+
381
+ bool insertSymbolAtCursor(const HoloSymbol &s)
382
+ {
383
+ clearPreparedTone();
384
+ HoloSymbol newsym(s);
385
+ newsym.setType(_inputType);
386
+ _symvec.insert(_symvec.begin() + _cursor, newsym);
387
+ _cursor++;
388
+
389
+ return true;
390
+ }
391
+
392
+ // if there is a prepared tone, the given tone parameter will be ignored
393
+ bool insertCharacterAtCursor(char c, unsigned int tone=0)
394
+ {
395
+ // fprintf(stderr, "insert char %d ('%c'), cursor=%d\n", c, c, _cursor);
396
+ if (IsDiacriticSymbol(c))
397
+ {
398
+ unsigned int tone = ToneFromDiacriticSymbol(c);
399
+
400
+ // if there's already a prepared tone, we replace it with the current one
401
+ if (_preparedTone) {
402
+ _preparedTone = tone;
403
+ return true;
404
+ }
405
+
406
+ if (_inputOption==DiacriticGivenBeforeVowel) {
407
+ _preparedTone = tone;
408
+ _cursor++;
409
+ }
410
+ else {
411
+ // diacritic given after vowel
412
+ if (hasPreviousSymbolAtCursor()) previousSymbolAtCursor().setTone(tone);
413
+ }
414
+ return true;
415
+ }
416
+
417
+ // if it's not a diacritic symbol, it's POJ^W^W^W, and it's n or u or g,
418
+ // (and if there's no prepared tone!)
419
+ // we need to do something special...
420
+ if (!IsDiacriticSymbol(c) && !_preparedTone /* && _inputType==POJSyllable */)
421
+ {
422
+ if (hasPreviousSymbolAtCursor())
423
+ {
424
+ string prev = previousSymbolAtCursor().symbolInLowerCase();
425
+
426
+ // N -> nn only works if the first character of the syllable is not an
427
+ // all uppercase symbol
428
+ if (c=='N' && ((prev != "n") && (prev != "nn")) && _inputType == POJSyllable
429
+ && (_symvec.size() > 0 && !_symvec[0].isUpperCase()))
430
+ {
431
+ // insert two n's in a row
432
+ _symvec.insert(_symvec.begin() + _cursor, HoloSymbol(string("nn"), _inputType));
433
+ _cursor++;
434
+ return true;
435
+ }
436
+ else if (tolower(c)=='n' && prev=="n") {
437
+ previousSymbolAtCursor().setSymbol(previousSymbolAtCursor().symbol() + string(1, c));
438
+ return true;
439
+ }
440
+ else if (_inputType == POJSyllable && tolower(c)=='u' && prev=="o") {
441
+ previousSymbolAtCursor().setSymbol(previousSymbolAtCursor().symbol() + string(1, c));
442
+ return true;
443
+ }
444
+ else if (_inputType == TLSyllable && tolower(c)=='o' && prev=="o") {
445
+ previousSymbolAtCursor().setSymbol(previousSymbolAtCursor().symbol() + string(1, c));
446
+ return true;
447
+ }
448
+ else if (tolower(c)=='g' && prev=="nn") {
449
+ // we need to break them up!
450
+ string before = previousSymbolAtCursor().symbol();
451
+
452
+ // and the tone of the previous symbol (when it's combined into nn) will be retained
453
+ previousSymbolAtCursor().setSymbol(before.substr(0, 1));
454
+
455
+ // insert one n and one g
456
+ _symvec.insert(_symvec.begin() + _cursor, HoloSymbol(before.substr(1,1), _inputType));
457
+ _cursor++;
458
+ _symvec.insert(_symvec.begin() + _cursor, HoloSymbol(string(1, c), _inputType));
459
+ _cursor++;
460
+ return true;
461
+ }
462
+ }
463
+ }
464
+
465
+
466
+ HoloSymbol s(string(1, c), _inputType);
467
+ if (_preparedTone)
468
+ {
469
+ _cursor--;
470
+ s.setTone(_preparedTone);
471
+ _preparedTone = 0;
472
+ }
473
+ else if (tone > 1)
474
+ {
475
+ s.setTone(tone);
476
+ }
477
+
478
+ _symvec.insert(_symvec.begin() + _cursor, s);
479
+ _cursor++;
480
+
481
+ return true;
482
+ }
483
+
484
+ bool removeCharacterAtRightOfCursor() // backspace
485
+ {
486
+ if (_preparedTone)
487
+ {
488
+ clearPreparedTone();
489
+ return true;
490
+ }
491
+
492
+ if (atBeginning()) return false;
493
+ _cursor--;
494
+ _symvec.erase(_symvec.begin() + _cursor);
495
+ return true;
496
+ }
497
+
498
+ bool removeCharacterAtLeftOfCursor() // delete
499
+ {
500
+ // we do some tightrope trick here: if we have _preparedTone ready,
501
+ // we "push back" the real _cursor position, do the delete thing,
502
+ // then push it back
503
+ bool retval=true;
504
+ if (_preparedTone) _cursor--;
505
+ if (atEnd()) retval=false; else _symvec.erase(_symvec.begin() + _cursor);
506
+ if (_preparedTone) _cursor++;
507
+ return retval;
508
+ }
509
+
510
+ // returns a normalized string that represents the "internal form" for querying the database
511
+ // implies normalization
512
+ string normalizedQueryData(unsigned int finalTone=0)
513
+ {
514
+ HoloSyllable s(*this);
515
+ s.normalize(finalTone);
516
+ string query;
517
+
518
+ unsigned int size=s._symvec.size();
519
+ unsigned int loudest = 0;
520
+
521
+ for (unsigned int i=0; i<size; i++) {
522
+ query = query + s._symvec[i].symbol();
523
+ if (s._symvec[i].tone() > 1) loudest = s._symvec[i].tone();
524
+ // fprintf (stderr, "combining query data %s, tone %d\n", s._symvec[i].symbol().c_str(), s._symvec[i].tone());
525
+ }
526
+
527
+ if (loudest > 1) query = query + string(1, loudest+'0');
528
+ return query;
529
+ }
530
+
531
+ // normalization is an "identpotent" function, ie. the result should
532
+ // be the same no matter how many times you call it--this being a very
533
+ // important linguistic characteristic of this function
534
+ void normalize(unsigned int finalTone=0)
535
+ {
536
+ // fprintf (stderr, "input finalTone=%d\n", finalTone);
537
+ unsigned int end = _symvec.size();
538
+
539
+ // if it's empty, just return
540
+ if (!end) return;
541
+
542
+ unsigned int loudestVowel = end;
543
+ unsigned int loudestTone = 0;
544
+ unsigned int p;
545
+
546
+ // find the loudest vowel
547
+ #define FLV(x) ((p=findSymbol(x)) != end)
548
+ #define SETLOUDEST(v) do { loudestVowel = v; if (_symvec[loudestVowel].tone()>1) { loudestTone = _symvec[loudestVowel].tone(); } } while(0)
549
+
550
+
551
+ if (end==1 && _symvec[0].symbolInLowerCase()=="m") SETLOUDEST(0);
552
+ if (FLV("n")) SETLOUDEST(p);
553
+ if (FLV("m")) SETLOUDEST(p);
554
+
555
+ // see if it's ng
556
+ if ((p=findSymbolPair("n", "g")) != end)
557
+ SETLOUDEST(p);
558
+
559
+ if (FLV("u")) SETLOUDEST(p);
560
+ if (FLV("i")) SETLOUDEST(p);
561
+ if (FLV("o")) SETLOUDEST(p);
562
+ if (FLV("e")) SETLOUDEST(p);
563
+ if (FLV("ou")) SETLOUDEST(p);
564
+ if (FLV("oo")) SETLOUDEST(p);
565
+ if (FLV("a")) SETLOUDEST(p);
566
+
567
+ // the last "ere" override
568
+ if (end >= 3) {
569
+ if (_symvec[end-1].symbolInLowerCase() == "e" && _symvec[end-2].symbolInLowerCase() == "r" && _symvec[end-3].symbolInLowerCase() == "e")
570
+ {
571
+ SETLOUDEST(end-1);
572
+ }
573
+ }
574
+
575
+ if (loudestVowel==end) return;
576
+ // fprintf(stderr, "found loudest vowel=%d (%s), loudest tone=%d\n", loudestVowel, _symvec[loudestVowel].symbol().c_str(), loudestTone);
577
+
578
+ // finalTone overrides
579
+ if (finalTone > 1) loudestTone = finalTone;
580
+
581
+ for (unsigned int i=0; i<end; i++) _symvec[i].setTone(0);
582
+
583
+ string lastSymbolStr = _symvec[end-1].symbolInLowerCase();
584
+
585
+ // if the symbol is "i", and there's a next "u", we shift
586
+ // the vowel to "u"
587
+
588
+ if (_symvec[loudestVowel].symbolInLowerCase()=="i")
589
+ {
590
+ if (loudestVowel+1 < end)
591
+ {
592
+ if (_symvec[loudestVowel+1].symbolInLowerCase()=="u") loudestVowel++;
593
+ }
594
+ }
595
+
596
+ if (loudestTone==4 || /* loudestTone==6 || */ loudestTone <= 1) {
597
+ // ignore the 4th, 6th and 1th (or no tone), so everything is set to 0 now
598
+ return;
599
+ }
600
+
601
+ if (lastSymbolStr=="t" || lastSymbolStr=="p" || lastSymbolStr=="k" || lastSymbolStr=="h") {
602
+ // only when the ending is t, p, k, h is the tone set -- and only when the tone is 8
603
+ if (loudestTone==8) _symvec[loudestVowel].setTone(loudestTone);
604
+ return;
605
+ }
606
+ else {
607
+ // if not t,p,k,h, we need to override the loudest tone--back to tone 1 !
608
+ if (loudestTone==8) {
609
+ _symvec[loudestVowel].setTone(0);
610
+ return;
611
+ }
612
+ }
613
+
614
+ _symvec[loudestVowel].setTone(loudestTone);
615
+
616
+ #undef FLV
617
+ #undef SETTONE
618
+ }
619
+
620
+ HoloSyllable convertToPOJSyllable()
621
+ {
622
+ HoloSyllable syl = *this;
623
+ syl.clearPreparedTone();
624
+ syl.setCursor(0);
625
+ if (_inputType==POJSyllable) return syl;
626
+
627
+ syl.setInputType(POJSyllable);
628
+ syl.clear();
629
+
630
+ // begin TL->POJ conversion
631
+ unsigned int size=_symvec.size();
632
+ unsigned int i;
633
+
634
+ for (i=0; i<size; i++)
635
+ {
636
+ HoloSymbol sym1 = _symvec[i];
637
+ string str1 = sym1.symbol();
638
+
639
+ // fprintf (stderr, "converting to POJ: %s\n", str1.c_str());
640
+
641
+ string lowstr1 = sym1.symbolInLowerCase();
642
+
643
+ // oo -> ou
644
+ if (lowstr1=="oo")
645
+ {
646
+ // detect case
647
+ if (str1[0] == tolower(str1[0])) {
648
+ syl.insertCharacterAtCursor('o', sym1.tone());
649
+ syl.insertCharacterAtCursor('u');
650
+ }
651
+ else
652
+ {
653
+ syl.insertCharacterAtCursor('O', sym1.tone());
654
+ syl.insertCharacterAtCursor('U');
655
+ }
656
+ continue;
657
+ }
658
+
659
+
660
+ if (hasNextSymbol(i)) {
661
+ HoloSymbol sym2 = _symvec[i+1];
662
+ string str2 = sym2.symbol();
663
+ string lowstr2 = sym2.symbolInLowerCase();
664
+
665
+ // ts -> ch with case detection
666
+ if (lowstr1=="t" && lowstr2=="s") {
667
+ // detect case
668
+ if (str1[0] == tolower(str1[0])) {
669
+ syl.insertCharacterAtCursor('c');
670
+ syl.insertCharacterAtCursor('h');
671
+ }
672
+ else {
673
+ syl.insertCharacterAtCursor('C');
674
+ syl.insertCharacterAtCursor('H');
675
+ }
676
+
677
+ i++;
678
+ continue;
679
+ }
680
+
681
+ // ue -> oe
682
+ if (lowstr1=="u" && lowstr2=="e") {
683
+ // detect case
684
+ if (str1[0] == tolower(str1[0])) {
685
+ syl.insertCharacterAtCursor('o', sym1.tone());
686
+ syl.insertCharacterAtCursor('e', sym2.tone());
687
+ }
688
+ else {
689
+ syl.insertCharacterAtCursor('O', sym1.tone());
690
+ syl.insertCharacterAtCursor('E', sym2.tone());
691
+ }
692
+
693
+ i++;
694
+ continue;
695
+ }
696
+
697
+ // ua -> oa
698
+ if (lowstr1=="u" && lowstr2=="a") {
699
+ // detect case
700
+ if (str1[0] == tolower(str1[0])) {
701
+ syl.insertCharacterAtCursor('o', sym1.tone());
702
+ syl.insertCharacterAtCursor('a', sym2.tone());
703
+ }
704
+ else {
705
+ syl.insertCharacterAtCursor('O', sym1.tone());
706
+ syl.insertCharacterAtCursor('A', sym2.tone());
707
+ }
708
+
709
+ i++;
710
+ continue;
711
+ }
712
+
713
+ // ik -> ek (at ending)
714
+ if (lowstr1=="i" && lowstr2=="k" && (i+2)==size) {
715
+ // detect case
716
+ if (str1[0] == tolower(str1[0])) {
717
+ syl.insertCharacterAtCursor('e', sym1.tone());
718
+ syl.insertCharacterAtCursor('k', sym2.tone());
719
+ }
720
+ else {
721
+ syl.insertCharacterAtCursor('E', sym1.tone());
722
+ syl.insertCharacterAtCursor('K', sym2.tone());
723
+ }
724
+
725
+ i++;
726
+ continue;
727
+ }
728
+
729
+ if (hasNextNextSymbol(i) && (i+3)==size) {
730
+ HoloSymbol sym3 = _symvec[i+2];
731
+ string str3 = sym3.symbol();
732
+ string lowstr3 = sym3.symbolInLowerCase();
733
+
734
+ // ing -> eng (must be ending)
735
+ if (lowstr1=="i" && lowstr2=="n" && lowstr3=="g") {
736
+ // detect case
737
+ if (str1[0] == tolower(str1[0])) {
738
+ syl.insertCharacterAtCursor('e', sym1.tone());
739
+ syl.insertCharacterAtCursor('n', sym2.tone());
740
+ syl.insertCharacterAtCursor('g', sym3.tone());
741
+ }
742
+ else {
743
+ syl.insertCharacterAtCursor('E', sym1.tone());
744
+ syl.insertCharacterAtCursor('N', sym2.tone());
745
+ syl.insertCharacterAtCursor('G', sym3.tone());
746
+ }
747
+
748
+ i+=2;
749
+ continue;
750
+ }
751
+
752
+ // ouh -> oh (ending)
753
+ if (lowstr1=="o" && lowstr2=="u" && lowstr3=="h") {
754
+ // detect case
755
+ if (str2[0] == tolower(str2[0])) {
756
+ syl.insertCharacterAtCursor('o', sym1.tone());
757
+ syl.insertCharacterAtCursor('h', sym2.tone());
758
+ }
759
+ else {
760
+ syl.insertCharacterAtCursor('O', sym1.tone());
761
+ syl.insertCharacterAtCursor('H', sym2.tone());
762
+ }
763
+
764
+ i+=2;
765
+ continue;
766
+ }
767
+ }
768
+ }
769
+
770
+ syl.insertSymbolAtCursor(sym1);
771
+ }
772
+
773
+ return syl;
774
+ }
775
+
776
+ HoloSyllable convertToTLSyllable()
777
+ {
778
+ HoloSyllable syl = *this;
779
+ syl.clearPreparedTone();
780
+ syl.setCursor(0);
781
+ if (_inputType==TLSyllable) return syl;
782
+
783
+ syl.setInputType(TLSyllable);
784
+ syl.clear();
785
+
786
+ // begin POJ->TL conversion
787
+ unsigned int size=_symvec.size();
788
+ unsigned int i;
789
+
790
+ for (i=0; i<size; i++)
791
+ {
792
+ HoloSymbol sym1 = _symvec[i];
793
+ string str1 = sym1.symbol();
794
+ string lowstr1 = sym1.symbolInLowerCase();
795
+
796
+ // ou -> oo
797
+ if (lowstr1=="ou")
798
+ {
799
+ // detect case
800
+ if (str1[0] == tolower(str1[0])) {
801
+ syl.insertCharacterAtCursor('o', sym1.tone());
802
+ syl.insertCharacterAtCursor('o');
803
+ }
804
+ else
805
+ {
806
+ syl.insertCharacterAtCursor('O', sym1.tone());
807
+ syl.insertCharacterAtCursor('O');
808
+ }
809
+ continue;
810
+ }
811
+
812
+
813
+ if (hasNextSymbol(i)) {
814
+ HoloSymbol sym2 = _symvec[i+1];
815
+ string str2 = sym2.symbol();
816
+ string lowstr2 = sym2.symbolInLowerCase();
817
+
818
+ // ch -> ts with case detection
819
+ if (lowstr1=="c" && lowstr2=="h") {
820
+ // detect case
821
+ if (str1[0] == tolower(str1[0])) {
822
+ syl.insertCharacterAtCursor('t');
823
+ syl.insertCharacterAtCursor('s');
824
+ }
825
+ else {
826
+ syl.insertCharacterAtCursor('T');
827
+ syl.insertCharacterAtCursor('S');
828
+ }
829
+
830
+ i++;
831
+ continue;
832
+ }
833
+
834
+ // oe -> ue
835
+ if (lowstr1=="o" && lowstr2=="e") {
836
+ // detect case
837
+ if (str1[0] == tolower(str1[0])) {
838
+ syl.insertCharacterAtCursor('u', sym1.tone());
839
+ syl.insertCharacterAtCursor('e', sym2.tone());
840
+ }
841
+ else {
842
+ syl.insertCharacterAtCursor('U', sym1.tone());
843
+ syl.insertCharacterAtCursor('E', sym2.tone());
844
+ }
845
+
846
+ i++;
847
+ continue;
848
+ }
849
+
850
+ // oa -> ua
851
+ if (lowstr1=="o" && lowstr2=="a") {
852
+ // detect case
853
+ if (str1[0] == tolower(str1[0])) {
854
+ syl.insertCharacterAtCursor('u', sym1.tone());
855
+ syl.insertCharacterAtCursor('a', sym2.tone());
856
+ }
857
+ else {
858
+ syl.insertCharacterAtCursor('U', sym1.tone());
859
+ syl.insertCharacterAtCursor('A', sym2.tone());
860
+ }
861
+
862
+ i++;
863
+ continue;
864
+ }
865
+
866
+ // ek -> ik (at ending)
867
+ if (lowstr1=="e" && lowstr2=="k" && (i+2)==size) {
868
+ // detect case
869
+ if (str1[0] == tolower(str1[0])) {
870
+ syl.insertCharacterAtCursor('i', sym1.tone());
871
+ syl.insertCharacterAtCursor('k', sym2.tone());
872
+ }
873
+ else {
874
+ syl.insertCharacterAtCursor('I', sym1.tone());
875
+ syl.insertCharacterAtCursor('K', sym2.tone());
876
+ }
877
+
878
+ i++;
879
+ continue;
880
+ }
881
+
882
+ if (hasNextNextSymbol(i) && (i+3)==size) {
883
+ HoloSymbol sym3 = _symvec[i+2];
884
+ string str3 = sym3.symbol();
885
+ string lowstr3 = sym3.symbolInLowerCase();
886
+
887
+ // ing -> eng (must be ending)
888
+ if (lowstr1=="e" && lowstr2=="n" && lowstr3=="g") {
889
+ // detect case
890
+ if (str1[0] == tolower(str1[0])) {
891
+ syl.insertCharacterAtCursor('i', sym1.tone());
892
+ syl.insertCharacterAtCursor('n', sym2.tone());
893
+ syl.insertCharacterAtCursor('g', sym3.tone());
894
+ }
895
+ else {
896
+ syl.insertCharacterAtCursor('I', sym1.tone());
897
+ syl.insertCharacterAtCursor('N', sym2.tone());
898
+ syl.insertCharacterAtCursor('G', sym3.tone());
899
+ }
900
+
901
+ i+=2;
902
+ continue;
903
+ }
904
+ }
905
+ }
906
+
907
+ syl.insertSymbolAtCursor(sym1);
908
+ }
909
+ return syl;
910
+ }
911
+
912
+
913
+ protected:
914
+ bool atBeginning()
915
+ {
916
+ return _cursor == 0;
917
+ }
918
+
919
+ bool atEnd()
920
+ {
921
+ return _cursor == numberOfCodepoints();
922
+ }
923
+
924
+ void clearPreparedTone()
925
+ {
926
+ if (!_preparedTone) return;
927
+ _preparedTone = 0;
928
+ _cursor--;
929
+ }
930
+
931
+ bool hasPreviousSymbolAtCursor()
932
+ {
933
+ unsigned int realcursor = _preparedTone ? _cursor-1 : _cursor;
934
+ return realcursor > 0;
935
+ }
936
+
937
+ bool hasNextSymbol(unsigned int pos)
938
+ {
939
+ if (pos+1 >= _symvec.size()) return false;
940
+ return true;
941
+ }
942
+
943
+ bool hasNextNextSymbol(unsigned int pos)
944
+ {
945
+ if (pos+2 >= _symvec.size()) return false;
946
+ return true;
947
+ }
948
+
949
+
950
+ // the result of this function is unpredictable if there's no
951
+ // previous symbol--always check with hasPreviousSymbolAtCursor() !
952
+ HoloSymbol& previousSymbolAtCursor()
953
+ {
954
+ unsigned int realcursor = _preparedTone ? _cursor-1 : _cursor;
955
+ return _symvec[realcursor-1];
956
+ }
957
+
958
+ // always assumes that the given input is in all lower case
959
+ unsigned int findSymbol(const char *s)
960
+ {
961
+ string cpps(s);
962
+ unsigned int size = _symvec.size();
963
+ unsigned int i;
964
+ for (i = 0; i < size; i++) {
965
+ if (_symvec[i].symbolInLowerCase() == cpps) break;
966
+ }
967
+ return i;
968
+ }
969
+
970
+ unsigned int findSymbolPair(const char *s1, const char *s2)
971
+ {
972
+ string cpps1(s1), cpps2(s2);
973
+
974
+ unsigned int size = _symvec.size();
975
+ if (size < 2) return size;
976
+
977
+ unsigned int i;
978
+ for (i = 0; i < size-1; i++) {
979
+ if (_symvec[i].symbolInLowerCase()==cpps1 && _symvec[i+1].symbolInLowerCase()==cpps2) return i;
980
+ }
981
+
982
+ return size;
983
+ }
984
+
985
+ SyllableType _inputType;
986
+ DiacriticInputOption _inputOption;
987
+ bool _forcePOJStyle;
988
+
989
+ vector<HoloSymbol> _symvec;
990
+ unsigned int _cursor;
991
+ unsigned int _preparedTone;
992
+ };
993
+
994
+ class FreeFormSyllable : public ComposableStringBuffer
995
+ {
996
+ public:
997
+ HoloSyllable convertToTLFromTLPA(unsigned int finalTone=0)
998
+ {
999
+ string rep=internalForm();
1000
+ HoloSyllable syl;
1001
+ syl.setInputType(TLSyllable);
1002
+
1003
+ unsigned int size=rep.length();
1004
+ for (unsigned int i=0; i<size; i++)
1005
+ {
1006
+ if (rep[i]=='c') {
1007
+ syl.insertCharacterAtCursor('t');
1008
+ syl.insertCharacterAtCursor('s');
1009
+ }
1010
+ else if (rep[i]=='C')
1011
+ {
1012
+ syl.insertCharacterAtCursor('T');
1013
+ syl.insertCharacterAtCursor('S');
1014
+ }
1015
+ else syl.insertCharacterAtCursor(rep[i]);
1016
+ }
1017
+
1018
+ syl.normalize(finalTone);
1019
+ return syl;
1020
+ }
1021
+
1022
+
1023
+ HoloSyllable convertToTLFromDT(unsigned int finalTone=0)
1024
+ {
1025
+ string rep=internalForm();
1026
+ HoloSyllable syl;
1027
+ syl.setInputType(TLSyllable);
1028
+
1029
+ unsigned int size=rep.length();
1030
+ for (unsigned int i=0; i<size; i++)
1031
+ {
1032
+ char dt1 = rep[i];
1033
+ char lowdt1 = tolower(dt1);
1034
+
1035
+ // r -> j (beginning)
1036
+ if (i==0 && lowdt1=='r') {
1037
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('j', dt1));
1038
+ continue;
1039
+ }
1040
+
1041
+ // replaces the two-character combinations
1042
+ if (i+1 < size) {
1043
+ string part=rep.substr(i, 2);
1044
+ string lower=toLowerString(part);
1045
+
1046
+ // or -> o
1047
+ if (lower=="or") {
1048
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('o', part));
1049
+ i++;
1050
+ continue;
1051
+ }
1052
+
1053
+ // en -> ian
1054
+ if (lower=="en") {
1055
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
1056
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('a', part));
1057
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('n', part));
1058
+ i++;
1059
+ continue;
1060
+ }
1061
+
1062
+ // et -> iat
1063
+ if (lower=="et") {
1064
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
1065
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('a', part));
1066
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', part));
1067
+ i++;
1068
+ continue;
1069
+ }
1070
+
1071
+
1072
+ // bh -> b (beginning)
1073
+ if (i==0 && lower=="bh") {
1074
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('b', part));
1075
+ i++;
1076
+ continue;
1077
+ }
1078
+
1079
+ // gh -> g (beginning)
1080
+ if (i==0 && lower=="gh") {
1081
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('g', part));
1082
+ i++;
1083
+ continue;
1084
+ }
1085
+
1086
+ // wa -> ua (beginning)
1087
+ if (lower=="wa" && i==0) {
1088
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('u', part));
1089
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('a', part));
1090
+ i++;
1091
+ continue;
1092
+ }
1093
+
1094
+ // we -> ue (beginning)
1095
+ if (lower=="we" && i==0) {
1096
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('u', part));
1097
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('e', part));
1098
+ i++;
1099
+ continue;
1100
+ }
1101
+
1102
+ // wi -> ui (beginning)
1103
+ if (lower=="wi" && i==0) {
1104
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('u', part));
1105
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
1106
+ i++;
1107
+ continue;
1108
+ }
1109
+
1110
+ // yo -> io (beginning)
1111
+ if (lower=="yo" && i==0) {
1112
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
1113
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('o', part));
1114
+ i++;
1115
+ continue;
1116
+ }
1117
+
1118
+ // yi -> i (beginning)
1119
+ if (lower=="yi" && i==0) {
1120
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
1121
+ i++;
1122
+ continue;
1123
+ }
1124
+ }
1125
+
1126
+ // o -> oo
1127
+ if (lowdt1=='o') {
1128
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('o', dt1));
1129
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('o', dt1));
1130
+ continue;
1131
+ }
1132
+
1133
+ // b -> p (beginning)
1134
+ if (i==0 && lowdt1=='b') {
1135
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('p', dt1));
1136
+ continue;
1137
+ }
1138
+
1139
+ // p -> ph (beginning)
1140
+ if (i==0 && lowdt1=='p') {
1141
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('p', dt1));
1142
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('h', dt1));
1143
+ continue;
1144
+ }
1145
+
1146
+ // k -> kh (beginning)
1147
+ if (i==0 && lowdt1=='k') {
1148
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('k', dt1));
1149
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('h', dt1));
1150
+ continue;
1151
+ }
1152
+
1153
+ // g -> k (beginning)
1154
+ if (i==0 && lowdt1=='g') {
1155
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('k', dt1));
1156
+ continue;
1157
+ }
1158
+
1159
+ // d -> t (beginning)
1160
+ if (i==0 && lowdt1=='d') {
1161
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', dt1));
1162
+ continue;
1163
+ }
1164
+
1165
+ // t -> th (beginning)
1166
+ if (i==0 && lowdt1=='t') {
1167
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', dt1));
1168
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('h', dt1));
1169
+ continue;
1170
+ }
1171
+
1172
+ // z -> ts (beginning)
1173
+ if (i==0 && lowdt1=='z') {
1174
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', dt1));
1175
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('s', dt1));
1176
+ continue;
1177
+ }
1178
+
1179
+ // c -> tsh (beginning)
1180
+ if (i==0 && lowdt1=='c') {
1181
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', dt1));
1182
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('s', dt1));
1183
+ syl.insertCharacterAtCursor(charWithCaseAccordingTo('h', dt1));
1184
+ continue;
1185
+ }
1186
+
1187
+ // else ...
1188
+ syl.insertCharacterAtCursor(dt1);
1189
+ }
1190
+
1191
+ // remap the final tone
1192
+ unsigned int tltone=finalTone;
1193
+
1194
+ syl.normalize(tltone);
1195
+ return syl;
1196
+
1197
+ }
1198
+
1199
+ protected:
1200
// Returns c in the case of ref: lower case when ref is unchanged by
// tolower() (which includes bytes that are not letters), upper case
// otherwise.
// Both characters go through unsigned char first; passing a negative char
// to the <ctype.h> functions is undefined behavior.
char charWithCaseAccordingTo(char c, char ref)
{
    const unsigned char reference = static_cast<unsigned char>(ref);
    const unsigned char value = static_cast<unsigned char>(c);

    if (tolower(reference) == reference) return static_cast<char>(tolower(value));
    return static_cast<char>(toupper(value));
}
1205
+
1206
// Returns c in the case of the first character of r: lower case when that
// character is unchanged by tolower() (which includes bytes that are not
// letters), upper case otherwise. An empty r is treated like a NUL first
// character and therefore yields lower case.
// Characters go through unsigned char first; passing a negative char to
// the <ctype.h> functions is undefined behavior.
char charWithCaseAccordingTo(char c, const std::string &r)
{
    const unsigned char reference = r.empty() ? 0 : static_cast<unsigned char>(r[0]);
    const unsigned char value = static_cast<unsigned char>(c);

    if (tolower(reference) == reference) return static_cast<char>(tolower(value));
    return static_cast<char>(toupper(value));
}
1211
+
1212
// Returns a copy of s with every byte passed through tolower().
// Each byte goes through unsigned char first; passing a negative char to
// tolower() is undefined behavior.
std::string toLowerString(const std::string &s)
{
    std::string lower;
    lower.reserve(s.length());

    for (std::string::size_type i = 0; i < s.length(); ++i)
        lower += static_cast<char>(tolower(static_cast<unsigned char>(s[i])));

    return lower;
}
1220
+ };
1221
+ };
1222
+
1223
+