cld3 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +18 -0
  3. data/LICENSE +204 -0
  4. data/LICENSE_CLD3 +203 -0
  5. data/README.md +22 -0
  6. data/cld3.gemspec +35 -0
  7. data/ext/cld3/base.cc +36 -0
  8. data/ext/cld3/base.h +106 -0
  9. data/ext/cld3/casts.h +98 -0
  10. data/ext/cld3/embedding_feature_extractor.cc +51 -0
  11. data/ext/cld3/embedding_feature_extractor.h +182 -0
  12. data/ext/cld3/embedding_network.cc +196 -0
  13. data/ext/cld3/embedding_network.h +186 -0
  14. data/ext/cld3/embedding_network_params.h +285 -0
  15. data/ext/cld3/extconf.rb +49 -0
  16. data/ext/cld3/feature_extractor.cc +137 -0
  17. data/ext/cld3/feature_extractor.h +633 -0
  18. data/ext/cld3/feature_extractor.proto +50 -0
  19. data/ext/cld3/feature_types.cc +72 -0
  20. data/ext/cld3/feature_types.h +158 -0
  21. data/ext/cld3/fixunicodevalue.cc +55 -0
  22. data/ext/cld3/fixunicodevalue.h +69 -0
  23. data/ext/cld3/float16.h +58 -0
  24. data/ext/cld3/fml_parser.cc +308 -0
  25. data/ext/cld3/fml_parser.h +123 -0
  26. data/ext/cld3/generated_entities.cc +296 -0
  27. data/ext/cld3/generated_ulscript.cc +678 -0
  28. data/ext/cld3/generated_ulscript.h +142 -0
  29. data/ext/cld3/getonescriptspan.cc +1109 -0
  30. data/ext/cld3/getonescriptspan.h +124 -0
  31. data/ext/cld3/integral_types.h +37 -0
  32. data/ext/cld3/lang_id_nn_params.cc +57449 -0
  33. data/ext/cld3/lang_id_nn_params.h +178 -0
  34. data/ext/cld3/language_identifier_features.cc +165 -0
  35. data/ext/cld3/language_identifier_features.h +116 -0
  36. data/ext/cld3/nnet_language_identifier.cc +380 -0
  37. data/ext/cld3/nnet_language_identifier.h +175 -0
  38. data/ext/cld3/nnet_language_identifier_c.cc +72 -0
  39. data/ext/cld3/offsetmap.cc +478 -0
  40. data/ext/cld3/offsetmap.h +168 -0
  41. data/ext/cld3/port.h +143 -0
  42. data/ext/cld3/registry.cc +28 -0
  43. data/ext/cld3/registry.h +242 -0
  44. data/ext/cld3/relevant_script_feature.cc +89 -0
  45. data/ext/cld3/relevant_script_feature.h +49 -0
  46. data/ext/cld3/script_detector.h +156 -0
  47. data/ext/cld3/sentence.proto +77 -0
  48. data/ext/cld3/sentence_features.cc +29 -0
  49. data/ext/cld3/sentence_features.h +35 -0
  50. data/ext/cld3/simple_adder.h +72 -0
  51. data/ext/cld3/stringpiece.h +81 -0
  52. data/ext/cld3/task_context.cc +161 -0
  53. data/ext/cld3/task_context.h +81 -0
  54. data/ext/cld3/task_context_params.cc +74 -0
  55. data/ext/cld3/task_context_params.h +54 -0
  56. data/ext/cld3/task_spec.proto +98 -0
  57. data/ext/cld3/text_processing.cc +245 -0
  58. data/ext/cld3/text_processing.h +30 -0
  59. data/ext/cld3/unicodetext.cc +96 -0
  60. data/ext/cld3/unicodetext.h +144 -0
  61. data/ext/cld3/utf8acceptinterchange.h +486 -0
  62. data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
  63. data/ext/cld3/utf8repl_lettermarklower.h +758 -0
  64. data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
  65. data/ext/cld3/utf8statetable.cc +1344 -0
  66. data/ext/cld3/utf8statetable.h +285 -0
  67. data/ext/cld3/utils.cc +241 -0
  68. data/ext/cld3/utils.h +144 -0
  69. data/ext/cld3/workspace.cc +64 -0
  70. data/ext/cld3/workspace.h +177 -0
  71. data/lib/cld3.rb +99 -0
  72. metadata +158 -0
@@ -0,0 +1,142 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ // generated_ulscript.h
16
+ // Machine generated. Do Not Edit.
17
+ //
18
+ // Declarations for scripts recognized by CLD2
19
+ //
20
+
21
+ #ifndef SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
22
+ #define SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
23
+
24
namespace chrome_lang_id {
namespace CLD2 {

// Four-valued classification associated with a script. The tables that
// assign and consume these values are not part of this header.
typedef enum {RTypeNone = 0, RTypeOne, RTypeMany, RTypeCJK} ULScriptRType;

// A (C string, integer) pair: s is the key, i is the value associated with it.
typedef struct {const char* s; int i;} CharIntPair;

// Numeric identifiers for scripts. The trailing comment on each enumerator
// is the four-letter code listed for that script. Values 32, 33 and 35 are
// numbered placeholders with no code listed. NUM_ULSCRIPTS is one past the
// last enumerator (102), i.e. the number of entries.
typedef enum {
  ULScript_Common = 0, // Zyyy
  ULScript_Latin = 1, // Latn
  ULScript_Greek = 2, // Grek
  ULScript_Cyrillic = 3, // Cyrl
  ULScript_Armenian = 4, // Armn
  ULScript_Hebrew = 5, // Hebr
  ULScript_Arabic = 6, // Arab
  ULScript_Syriac = 7, // Syrc
  ULScript_Thaana = 8, // Thaa
  ULScript_Devanagari = 9, // Deva
  ULScript_Bengali = 10, // Beng
  ULScript_Gurmukhi = 11, // Guru
  ULScript_Gujarati = 12, // Gujr
  ULScript_Oriya = 13, // Orya
  ULScript_Tamil = 14, // Taml
  ULScript_Telugu = 15, // Telu
  ULScript_Kannada = 16, // Knda
  ULScript_Malayalam = 17, // Mlym
  ULScript_Sinhala = 18, // Sinh
  ULScript_Thai = 19, // Thai
  ULScript_Lao = 20, // Laoo
  ULScript_Tibetan = 21, // Tibt
  ULScript_Myanmar = 22, // Mymr
  ULScript_Georgian = 23, // Geor
  ULScript_Hani = 24, // Hani
  ULScript_Ethiopic = 25, // Ethi
  ULScript_Cherokee = 26, // Cher
  ULScript_Canadian_Aboriginal = 27, // Cans
  ULScript_Ogham = 28, // Ogam
  ULScript_Runic = 29, // Runr
  ULScript_Khmer = 30, // Khmr
  ULScript_Mongolian = 31, // Mong
  ULScript_32 = 32, //
  ULScript_33 = 33, //
  ULScript_Bopomofo = 34, // Bopo
  ULScript_35 = 35, //
  ULScript_Yi = 36, // Yiii
  ULScript_Old_Italic = 37, // Ital
  ULScript_Gothic = 38, // Goth
  ULScript_Deseret = 39, // Dsrt
  ULScript_Inherited = 40, // Zinh
  ULScript_Tagalog = 41, // Tglg
  ULScript_Hanunoo = 42, // Hano
  ULScript_Buhid = 43, // Buhd
  ULScript_Tagbanwa = 44, // Tagb
  ULScript_Limbu = 45, // Limb
  ULScript_Tai_Le = 46, // Tale
  ULScript_Linear_B = 47, // Linb
  ULScript_Ugaritic = 48, // Ugar
  ULScript_Shavian = 49, // Shaw
  ULScript_Osmanya = 50, // Osma
  ULScript_Cypriot = 51, // Cprt
  ULScript_Braille = 52, // Brai
  ULScript_Buginese = 53, // Bugi
  ULScript_Coptic = 54, // Copt
  ULScript_New_Tai_Lue = 55, // Talu
  ULScript_Glagolitic = 56, // Glag
  ULScript_Tifinagh = 57, // Tfng
  ULScript_Syloti_Nagri = 58, // Sylo
  ULScript_Old_Persian = 59, // Xpeo
  ULScript_Kharoshthi = 60, // Khar
  ULScript_Balinese = 61, // Bali
  ULScript_Cuneiform = 62, // Xsux
  ULScript_Phoenician = 63, // Phnx
  ULScript_Phags_Pa = 64, // Phag
  ULScript_Nko = 65, // Nkoo
  ULScript_Sundanese = 66, // Sund
  ULScript_Lepcha = 67, // Lepc
  ULScript_Ol_Chiki = 68, // Olck
  ULScript_Vai = 69, // Vaii
  ULScript_Saurashtra = 70, // Saur
  ULScript_Kayah_Li = 71, // Kali
  ULScript_Rejang = 72, // Rjng
  ULScript_Lycian = 73, // Lyci
  ULScript_Carian = 74, // Cari
  ULScript_Lydian = 75, // Lydi
  ULScript_Cham = 76, // Cham
  ULScript_Tai_Tham = 77, // Lana
  ULScript_Tai_Viet = 78, // Tavt
  ULScript_Avestan = 79, // Avst
  ULScript_Egyptian_Hieroglyphs = 80, // Egyp
  ULScript_Samaritan = 81, // Samr
  ULScript_Lisu = 82, // Lisu
  ULScript_Bamum = 83, // Bamu
  ULScript_Javanese = 84, // Java
  ULScript_Meetei_Mayek = 85, // Mtei
  ULScript_Imperial_Aramaic = 86, // Armi
  ULScript_Old_South_Arabian = 87, // Sarb
  ULScript_Inscriptional_Parthian = 88, // Prti
  ULScript_Inscriptional_Pahlavi = 89, // Phli
  ULScript_Old_Turkic = 90, // Orkh
  ULScript_Kaithi = 91, // Kthi
  ULScript_Batak = 92, // Batk
  ULScript_Brahmi = 93, // Brah
  ULScript_Mandaic = 94, // Mand
  ULScript_Chakma = 95, // Cakm
  ULScript_Meroitic_Cursive = 96, // Merc
  ULScript_Meroitic_Hieroglyphs = 97, // Mero
  ULScript_Miao = 98, // Plrd
  ULScript_Sharada = 99, // Shrd
  ULScript_Sora_Sompeng = 100, // Sora
  ULScript_Takri = 101, // Takr
  NUM_ULSCRIPTS
} ULScript;

// "Unknown script" is represented by ULScript_Common (value 0).
#define UNKNOWN_ULSCRIPT ULScript_Common

} // namespace CLD2
} // namespace chrome_lang_id
141
+
142
+ #endif // SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
@@ -0,0 +1,1109 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ //
16
+ // Author: dsites@google.com (Dick Sites)
17
+ //
18
+
19
+
20
+ #include "getonescriptspan.h"
21
+
22
+ #include <string.h>
23
+
24
+ #include "fixunicodevalue.h"
25
+ #include "port.h"
26
+ #include "utf8acceptinterchange.h"
27
+ #include "utf8repl_lettermarklower.h"
28
+ #include "utf8prop_lettermarkscriptnum.h"
29
+ #include "utf8scannot_lettermarkspecial.h"
30
+ #include "utf8statetable.h"
31
+
32
+ namespace chrome_lang_id {
33
+ namespace CLD2 {
34
+
35
+ // Alphabetical order for binary search, from
36
+ // generated_entities.cc
37
+ extern const int kNameToEntitySize;
38
+ extern const CharIntPair kNameToEntity[];
39
+
40
// Byte-indexed flag table: non-zero only for '&' (0x26), '<' (0x3C) and
// '>' (0x3E). All other bytes, including everything >= 0x80, are zero.
static const char kSpecialSymbol[256] = { // true for < > &
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
51
+
52
+
53
+
54
// Character classes used as column indexes into the tag-parsing state table.
// The macros exist only to make the table below readable and are #undef'd
// immediately after it.
#define LT 0 // <
#define GT 1 // >
#define EX 2 // !
#define HY 3 // -
#define QU 4 // "
#define AP 5 // '
#define SL 6 // /
#define S_ 7
#define C_ 8
#define R_ 9
#define I_ 10
#define P_ 11
#define T_ 12
#define Y_ 13
#define L_ 14
#define E_ 15
#define CR 16 // <cr> or <lf>
#define NL 17 // non-letter: ASCII whitespace, digit, punctuation
#define PL 18 // possible letter, incl. &
#define xx 19 // <unused>

// Map byte to one of ~20 interesting categories for cheap tag parsing
// The letters S C R I P T Y L E get their own classes in both upper and
// lower case (rows 0x40 and 0x60 are identical), so SCRIPT and STYLE tag
// names are matched case-insensitively. Bytes 0x80-0xBF map to NL and
// bytes 0xC0-0xFF map to PL.
static const uint8 kCharToSub[256] = {
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,

  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,

  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,

  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
};

#undef LT
#undef GT
#undef EX
#undef HY
#undef QU
#undef AP
#undef SL
#undef S_
#undef C_
#undef R_
#undef I_
#undef P_
#undef T_
#undef Y_
#undef L_
#undef E_
#undef CR
#undef NL
#undef PL
#undef xx
118
+
119
+
120
// Shorthand for the two terminal states used in the table below; both are
// #undef'd right after the table.
#define OK 0
#define X_ 1


// Largest state number that ends a scan. With the letters/marks-only limit
// the scan stops on states 0 and 1; with the all-text limit it also stops
// on state 2.
static const int kMaxExitStateLettersMarksOnly = 1;
static const int kMaxExitStateAllText = 2;


// State machine to do cheap parse of non-letter strings incl. tags
// advances <tag>
//          |    |
// advances <tag> ... </tag> for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          || (0)
// advances <tag <tag2>
//          || (0)
//
// We start in state [0] at a non-letter and make at least one transition
// When scanning for just letters, arriving back at state [0] or [1] exits
// the state machine.
// When scanning for any non-tag text, arriving at state [2] also exits
//
// Layout: one row of 20 entries per state, stored flat; the entry for
// state s and character class c is at index s * 20 + c, where c comes from
// kCharToSub. Each entry is the next state.
static const uint8 kTagParseTbl_0[] = {
// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK exit state
  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL* [exit state]
  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF
  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF
  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
};

#undef OK
#undef X_
194
+
195
// Constants for rune (code point) <-> UTF-8 conversion. Runemax and
// Runeerror are the upper bound and the substitute value used by
// runetochar(); the other three are not referenced in the visible part of
// this file.
enum
{
  UTFmax = 4, // maximum bytes per rune
  Runesync = 0x80, // cannot represent part of a UTF sequence (<)
  Runeself = 0x80, // rune and UTF sequences are the same (<)
  Runeerror = 0xFFFD, // decoding error in UTF
  Runemax = 0x10FFFF, // maximum rune value
};
203
+
204
// Debugging. Not thread safe.
// DisplayPiece() formats into this single static buffer, so each call
// overwrites the result of the previous one.
static char gDisplayPiece[32];

// UTF-8 sequence length, indexed by the high nibble of the lead byte.
const unsigned char gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4};

// Copies up to 8 UTF-8 characters from next_byte_[0..byte_length_) into the
// static buffer, replacing < > & ' " with their HTML entity names, and
// returns a pointer to the NUL-terminated result.
//
// Copying stops early when a character would run past byte_length_, when
// the buffer might not have room for another (possibly escaped) character,
// or when 8 characters have been copied.
char* DisplayPiece(const char* next_byte_, int byte_length_) {
  const int kCapacity = static_cast<int>(sizeof(gDisplayPiece));
  // Copy up to 8 UTF-8 chars to buffer
  int k = 0;    // byte count
  int n = 0;    // character count
  for (int i = 0; i < byte_length_; ++i) {
    char c = next_byte_[i];
    if ((c & 0xc0) != 0x80) {
      // Beginning of a UTF-8 character
      int charlen = gCharlen[static_cast<unsigned char>(c) >> 4];
      if (i + charlen > byte_length_) {break;}  // Not enough room for full char
      if (k >= (kCapacity - 7)) {break;}        // Not necessarily enough room
      if (n >= 8) {break;}                      // Enough characters already
      ++n;
    } else if (k >= (kCapacity - 1)) {
      // Continuation byte. Well-formed input never reaches this limit, but
      // malformed input (a long run of 0x80..0xBF bytes) skips the checks
      // above and would otherwise write past the end of the buffer. Keep
      // one byte free for the terminating NUL.
      break;
    }
    if (c == '<') {
      memcpy(&gDisplayPiece[k], "&lt;", 4); k += 4;
    } else if (c == '>') {
      memcpy(&gDisplayPiece[k], "&gt;", 4); k += 4;
    } else if (c == '&') {
      memcpy(&gDisplayPiece[k], "&amp;", 5); k += 5;
    } else if (c == '\'') {
      memcpy(&gDisplayPiece[k], "&apos;", 6); k += 6;
    } else if (c == '"') {
      memcpy(&gDisplayPiece[k], "&quot;", 6); k += 6;
    } else {
      gDisplayPiece[k++] = c;
    }
  }
  gDisplayPiece[k++] = '\0';
  return gDisplayPiece;
}
238
+
239
+
240
+
241
+ // runetochar copies (encodes) one rune, pointed to by r, to at most
242
+ // UTFmax bytes starting at s and returns the number of bytes generated.
243
+ int runetochar(char *str, const char32 *rune) {
244
+ // Convert to unsigned for range check.
245
+ unsigned long c;
246
+
247
+ // 1 char 00-7F
248
+ c = *rune;
249
+ if(c <= 0x7F) {
250
+ str[0] = static_cast<char>(c);
251
+ return 1;
252
+ }
253
+
254
+ // 2 char 0080-07FF
255
+ if(c <= 0x07FF) {
256
+ str[0] = 0xC0 | static_cast<char>(c >> 1*6);
257
+ str[1] = 0x80 | (c & 0x3F);
258
+ return 2;
259
+ }
260
+
261
+ // Range check
262
+ if (c > Runemax) {
263
+ c = Runeerror;
264
+ }
265
+
266
+ // 3 char 0800-FFFF
267
+ if (c <= 0xFFFF) {
268
+ str[0] = 0xE0 | static_cast<char>(c >> 2*6);
269
+ str[1] = 0x80 | ((c >> 1*6) & 0x3F);
270
+ str[2] = 0x80 | (c & 0x3F);
271
+ return 3;
272
+ }
273
+
274
+ // 4 char 10000-1FFFFF
275
+ str[0] = 0xF0 | static_cast<char>(c >> 3*6);
276
+ str[1] = 0x80 | ((c >> 2*6) & 0x3F);
277
+ str[2] = 0x80 | ((c >> 1*6) & 0x3F);
278
+ str[3] = 0x80 | (c & 0x3F);
279
+ return 4;
280
+ }
281
+
282
+
283
+
284
+ // Useful for converting an entity to an ascii value.
285
+ // RETURNS unicode value, or -1 if entity isn't valid. Don't include & or ;
286
+ int LookupEntity(const char* entity_name, int entity_len) {
287
+ // Make a C string
288
+ if (entity_len >= 16) {return -1;} // All real entities are shorter
289
+ char temp[16];
290
+ memcpy(temp, entity_name, entity_len);
291
+ temp[entity_len] = '\0';
292
+ int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity);
293
+ if (match >= 0) {return kNameToEntity[match].i;}
294
+ return -1;
295
+ }
296
+
297
// True for the ASCII decimal digits '0'..'9' only.
bool ascii_isdigit(char c) {
  return (c >= '0') && (c <= '9');
}
300
// True for ASCII hexadecimal digits: '0'..'9', 'a'..'f', 'A'..'F'.
bool ascii_isxdigit(char c) {
  const bool decimal = (c >= '0') && (c <= '9');
  const bool lower_hex = (c >= 'a') && (c <= 'f');
  const bool upper_hex = (c >= 'A') && (c <= 'F');
  return decimal || lower_hex || upper_hex;
}
306
// True for ASCII letters and decimal digits: '0'..'9', 'a'..'z', 'A'..'Z'.
bool ascii_isalnum(char c) {
  const bool digit = (c >= '0') && (c <= '9');
  const bool lower = (c >= 'a') && (c <= 'z');
  const bool upper = (c >= 'A') && (c <= 'Z');
  return digit || lower || upper;
}
312
// Returns the value 0..15 of an ASCII hexadecimal digit (either case).
// Any other character yields 0.
int hex_digit_to_int(char c) {
  if ((c >= '0') && (c <= '9')) {
    return c - '0';
  } else if ((c >= 'a') && (c <= 'f')) {
    return 10 + (c - 'a');
  } else if ((c >= 'A') && (c <= 'F')) {
    return 10 + (c - 'A');
  }
  return 0;
}
318
+
319
+ static int32 strto32_base10(const char* nptr, const char* limit,
320
+ const char **endptr) {
321
+ *endptr = nptr;
322
+ while (nptr < limit && *nptr == '0') {
323
+ ++nptr;
324
+ }
325
+ if (nptr == limit || !ascii_isdigit(*nptr))
326
+ return -1;
327
+ const char* end_digits_run = nptr;
328
+ while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) {
329
+ ++end_digits_run;
330
+ }
331
+ *endptr = end_digits_run;
332
+ const int num_digits = end_digits_run - nptr;
333
+ // kint32max == 2147483647.
334
+ if (num_digits < 9 ||
335
+ (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) {
336
+ int value = 0;
337
+ for (; nptr < end_digits_run; ++nptr) {
338
+ value *= 10;
339
+ value += *nptr - '0';
340
+ }
341
+ // Overflow past the last valid unicode codepoint
342
+ // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
343
+ return FixUnicodeValue(value);
344
+ } else {
345
+ // Overflow: can't fit in an int32;
346
+ // returns the replacement character 0xFFFD.
347
+ return 0xFFFD;
348
+ }
349
+ }
350
+
351
+ static int32 strto32_base16(const char* nptr, const char* limit,
352
+ const char **endptr) {
353
+ *endptr = nptr;
354
+ while (nptr < limit && *nptr == '0') {
355
+ ++nptr;
356
+ }
357
+ if (nptr == limit || !ascii_isxdigit(*nptr)) {
358
+ return -1;
359
+ }
360
+ const char* end_xdigits_run = nptr;
361
+ while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) {
362
+ ++end_xdigits_run;
363
+ }
364
+ *endptr = end_xdigits_run;
365
+ const int num_xdigits = end_xdigits_run - nptr;
366
+ // kint32max == 0x7FFFFFFF.
367
+ if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) {
368
+ int value = 0;
369
+ for (; nptr < end_xdigits_run; ++nptr) {
370
+ value <<= 4;
371
+ value += hex_digit_to_int(*nptr);
372
+ }
373
+ // Overflow past the last valid unicode codepoint
374
+ // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
375
+ return FixUnicodeValue(value);
376
+ } else {
377
+ // Overflow: can't fit in an int32;
378
+ // returns the replacement character 0xFFFD.
379
+ return 0xFFFD;
380
+ }
381
+ }
382
+
383
+ // Unescape the current character pointed to by src. SETS the number
384
+ // of chars read for the conversion (in UTF8). If src isn't a valid entity,
385
+ // just consume the & and RETURN -1. If src doesn't point to & -- which it
386
+ // should -- set src_consumed to 0 and RETURN -1.
387
+ int ReadEntity(const char* src, int srcn, int* src_consumed) {
388
+ const char* const srcend = src + srcn;
389
+
390
+ if (srcn == 0 || *src != '&') { // input should start with an ampersand
391
+ *src_consumed = 0;
392
+ return -1;
393
+ }
394
+ *src_consumed = 1; // we'll get the & at least
395
+
396
+ // The standards are a bit unclear on when an entity ends. Certainly a ";"
397
+ // ends one, but spaces probably do too. We follow the lead of both IE and
398
+ // Netscape, which as far as we can tell end numeric entities (1st case below)
399
+ // at any non-digit, and end character entities (2nd case) at any non-alnum.
400
+ const char* entstart, *entend; // where the entity starts and ends
401
+ entstart = src + 1; // read past the &
402
+ int entval; // UCS2 value of the entity
403
+ if ( *entstart == '#' ) { // -- 1st case: numeric entity
404
+ if ( entstart + 2 >= srcend ) {
405
+ return -1; // no way a legitimate number could fit
406
+ } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) { // hex numeric
407
+ entval = strto32_base16(entstart + 2, srcend, &entend);
408
+ } else { // decimal numeric entity
409
+ entval = strto32_base10(entstart+1, srcend, &entend);
410
+ }
411
+ if (entval == -1 || entend > srcend) {
412
+ return -1; // not entirely correct, but close enough
413
+ }
414
+ } else { // -- 2nd case: character entity
415
+ for (entend = entstart;
416
+ entend < srcend && ascii_isalnum(*entend);
417
+ ++entend ) {
418
+ // entity consists of alphanumeric chars
419
+ }
420
+ entval = LookupEntity(entstart, entend - entstart);
421
+ if (entval < 0) {
422
+ return -1; // not a legal entity name
423
+ }
424
+ // Now we do a strange-seeming IE6-compatibility check: if entval is
425
+ // >= 256, it *must* be followed by a semicolon or it's not considered
426
+ // an entity. The problem is lots of the newfangled entity names, like
427
+ // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en".
428
+ // When these links are written in HTML, it would be really bad if the
429
+ // "&lang" were treated as an entity, which is what the spec says
430
+ // *should* happen (even when the HTML is inside an "A HREF" tag!)
431
+ // IE ignores the spec for these new, high-value entities, so we do too.
432
+ if ( entval >= 256 && !(entend < srcend && *entend == ';') ) {
433
+ return -1; // make non-;-terminated entity illegal
434
+ }
435
+ }
436
+
437
+ // Finally, figure out how much src was consumed
438
+ if ( entend < srcend && *entend == ';' ) {
439
+ entend++; // standard says ; terminator is special
440
+ }
441
+ *src_consumed = entend - src;
442
+ return entval;
443
+ }
444
+
445
+
446
+ // Src points to '&'
447
+ // Writes entity value to dst. Returns take(src), put(dst) byte counts
448
+ void EntityToBuffer(const char* src, int len, char* dst,
449
+ int* tlen, int* plen) {
450
+ char32 entval = ReadEntity(src, len, tlen);
451
+
452
+ // ReadEntity does this already: entval = FixUnicodeValue(entval);
453
+
454
+ // Convert UTF-32 to UTF-8
455
+ if (entval > 0) {
456
+ *plen = runetochar(dst, &entval);
457
+ } else {
458
+ // Illegal entity; ignore the '&'
459
+ *tlen = 1;
460
+ *plen = 0;
461
+ }
462
+ }
463
+
464
+ // Returns true if character is < > or &, none of which are letters
465
+ bool inline IsSpecial(char c) {
466
+ // Comparison (int != 0) is used to silence the warning:
467
+ // 'const char': forcing value to bool
468
+ if ((c & 0xe0) == 0x20) {
469
+ return (kSpecialSymbol[static_cast<uint8>(c)] != 0);
470
+ }
471
+ return false;
472
+ }
473
+
474
+ // Quick Skip to next letter or < > & or to end of string (eos)
475
+ // Always return is_letter for eos
476
+ int ScanToLetterOrSpecial(const char* src, int len) {
477
+ int bytes_consumed;
478
+ StringPiece str(src, len);
479
+ UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed);
480
+ return bytes_consumed;
481
+ }
482
+
483
+
484
+
485
+
486
// src points to non-letter, such as tag-opening '<'
// Return length from here to next possible letter
// On another < before >, return 1
// advances <tag>
//          |    |
// advances <tag> ... </tag> for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          |    | end of string
// advances <tag <tag2>
//          ||
//
// isrc/len is the text to scan. max_exit_state is the largest state number
// that stops the scan: every state <= max_exit_state is an exit state.
// Runs the kTagParseTbl_0 state machine over the bytes, starting in
// state 0, with kCharToSub selecting the column for each byte.
int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) {
  const uint8* src = reinterpret_cast<const uint8*>(isrc);
  const uint8* srclimit = src + len;
  const uint8* tagParseTbl = kTagParseTbl_0;   // row for the current state
  int e = 0;                                   // most recent next-state
  while (src < srclimit) {
    e = tagParseTbl[kCharToSub[*src++]];
    if (e <= max_exit_state) {
      // We overshot by one byte
      --src;
      break;
    }
    tagParseTbl = &kTagParseTbl_0[e * 20];     // 20 columns per state row
  }

  if (src >= srclimit) {
    // We fell off the end of the text.
    // It looks like the most common case for this is a truncated file, not
    // mismatched angle brackets. So we pretend that the last char was '>'
    return len;
  }

  // OK to be in state 0 or state 2 at exit
  if ((e != 0) && (e != 2)) {
    // Error, '<' followed by '<'
    // We want to back up to first <, then advance by one byte past it
    int offset = src - reinterpret_cast<const uint8*>(isrc);

    // Backscan to first '<' and return enough length to just get past it
    --offset;   // back up over the second '<', which caused us to stop
    while ((0 < offset) && (isrc[offset] != '<')) {
      // Find the first '<', which is unmatched
      --offset;
    }
    // skip to just beyond first '<'
    return offset + 1;
  }

  // Normal exit: number of bytes before the byte that caused the exit.
  return src - reinterpret_cast<const uint8*>(isrc);
}
539
+
540
+ // Returns mid if key found in lo <= mid < hi, else -1
541
+ int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) {
542
+ // binary search
543
+ while (lo < hi) {
544
+ int mid = (lo + hi) >> 1;
545
+ if (strcmp(key, cipair[mid].s) < 0) {
546
+ hi = mid;
547
+ } else if (strcmp(key, cipair[mid].s) > 0) {
548
+ lo = mid + 1;
549
+ } else {
550
+ return mid;
551
+ }
552
+ }
553
+ return -1;
554
+ }
555
+
556
+ // Returns the length in bytes of the prefix of src that is all
557
+ // interchange valid UTF-8
558
+ int SpanInterchangeValid(const char* src, int byte_length) {
559
+ int bytes_consumed;
560
+ const UTF8ReplaceObj* st = &utf8acceptinterchange_obj;
561
+ StringPiece str(src, byte_length);
562
+ UTF8GenericScan(st, str, &bytes_consumed);
563
+ return bytes_consumed;
564
+ }
565
+
566
// Basic constructor: scans letters and marks only, one script per span.
// The input buffer is not copied; only the pointer and length are stored.
// Allocates the two working buffers that the destructor releases with
// delete[].
ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text)
  : start_byte_(buffer),
  next_byte_(buffer),
  byte_length_(buffer_length),
  is_plain_text_(is_plain_text),
  letters_marks_only_(true),
  one_script_only_(true),
  exit_state_(kMaxExitStateLettersMarksOnly) {
    script_buffer_ = new char[kMaxScriptBuffer];
    script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
    map2original_.Clear();    // map from script_buffer_ to buffer
    map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
}
581
+
582
// Extended version to allow spans of any non-tag text and spans of mixed script
// any_text == false and any_script == false give the same settings as the
// three-argument constructor. any_text also selects which tag-parser states
// count as exit states (kMaxExitStateAllText vs.
// kMaxExitStateLettersMarksOnly).
// The input buffer is not copied; only the pointer and length are stored.
ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text,
                             bool any_text,
                             bool any_script)
  : start_byte_(buffer),
  next_byte_(buffer),
  byte_length_(buffer_length),
  is_plain_text_(is_plain_text),
  letters_marks_only_(!any_text),
  one_script_only_(!any_script),
  exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) {
    script_buffer_ = new char[kMaxScriptBuffer];
    script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
    map2original_.Clear();    // map from script_buffer_ to buffer
    map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
}
600
+
601
+
602
// Releases the two working buffers allocated with new[] by the constructors.
ScriptScanner::~ScriptScanner() {
  delete[] script_buffer_;
  delete[] script_buffer_lower_;
}
606
+
607
+
608
+
609
+
610
+ // Get to the first real non-tag letter or entity that is a letter
611
+ // Sets script of that letter
612
+ // Return len if no more letters
613
+ int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
614
+ int sc = UNKNOWN_ULSCRIPT;
615
+ int skip = 0;
616
+ int tlen, plen;
617
+
618
+ // Do run of non-letters (tag | &NL | NL)*
619
+ tlen = 0;
620
+ while (skip < len) {
621
+ // Do fast scan to next interesting byte
622
+ // int oldskip = skip;
623
+ skip += ScanToLetterOrSpecial(src + skip, len - skip);
624
+
625
+ // Check for no more letters/specials
626
+ if (skip >= len) {
627
+ // All done
628
+ *script = sc;
629
+ return len;
630
+ }
631
+
632
+ // We are at a letter, nonletter, tag, or entity
633
+ if (IsSpecial(src[skip]) && !is_plain_text_) {
634
+ if (src[skip] == '<') {
635
+ // Begining of tag; skip to end and go around again
636
+ tlen = ScanToPossibleLetter(src + skip, len - skip,
637
+ exit_state_);
638
+ sc = 0;
639
+ } else if (src[skip] == '>') {
640
+ // Unexpected end of tag; skip it and go around again
641
+ tlen = 1; // Over the >
642
+ sc = 0;
643
+ } else if (src[skip] == '&') {
644
+ // Expand entity, no advance
645
+ char temp[4];
646
+ EntityToBuffer(src + skip, len - skip,
647
+ temp, &tlen, &plen);
648
+ if (plen > 0) {
649
+ sc = GetUTF8LetterScriptNum(temp);
650
+ }
651
+ }
652
+ } else {
653
+ // Update 1..4 bytes
654
+ tlen = UTF8OneCharLen(src + skip);
655
+ sc = GetUTF8LetterScriptNum(src + skip);
656
+ }
657
+ if (sc != 0) {break;} // Letter found
658
+ skip += tlen; // Else advance
659
+ }
660
+
661
+ *script = sc;
662
+ return skip;
663
+ }
664
+
665
+
666
// ASCII-only tag-name helper.
// True when uplow, with bit 0x20 forced on (the ASCII lowercase bit),
// equals c. Callers pass c as a lowercase letter.
inline bool EqCase(char uplow, char c) {
  const int folded = uplow | 0x20;
  return c == folded;
}
671
+
672
// ASCII-only tag-name helper.
// True for bytes below 0x40 (space, '<', '>', digits, ...), i.e. bytes
// that cannot be an ASCII letter. The comparison is made on plain char,
// so the answer for bytes >= 0x80 follows the platform's char signedness.
inline bool NeqLetter(char c) {
  return !(c >= 0x40);
}
677
+
678
// ASCII-only helper.
// True for space and '\n' only; '\r' and tab are not treated as
// whitespace here.
inline bool WS(char c) {
  switch (c) {
    case ' ':
    case '\n':
      return true;
    default:
      return false;
  }
}
683
+
684
// Canonical line break: both CR and LF in the input are mapped to this.
static const char LF('\n');
686
+
687
+
688
+ // The naive loop scans from next_byte_ to script_buffer_ until full.
689
+ // But this can leave an awkward hard-to-identify short fragment at the
690
+ // end of the input. We would prefer to make the next-to-last fragment
691
+ // shorter and the last fragment longer.
692
+
693
+ // Copy next run of non-tag characters to buffer [NUL terminated]
694
+ // This just replaces tags with space or \n and removes entities.
695
+ // Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences
696
+ // including \r or \n are replaced by \n. All other tags and skipped text
697
+ // are replaced with ASCII space.
698
+ //
699
+ // Buffer ALWAYS has leading space and trailing space space space NUL
700
+ bool ScriptScanner::GetOneTextSpan(LangSpan* span) {
701
+ span->text = script_buffer_;
702
+ span->text_bytes = 0;
703
+ span->offset = next_byte_ - start_byte_;
704
+ span->ulscript = UNKNOWN_ULSCRIPT;
705
+ span->truncated = false;
706
+
707
+ int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
708
+ if ((kMaxScriptBytes <= byte_length_) &&
709
+ (byte_length_ < (2 * kMaxScriptBytes))) {
710
+ // Try to split the last two fragments in half
711
+ put_soft_limit = byte_length_ / 2;
712
+ }
713
+
714
+ script_buffer_[0] = ' '; // Always a space at front of output
715
+ script_buffer_[1] = '\0';
716
+ int take = 0;
717
+ int put = 1; // Start after the initial space
718
+ int tlen = 0, plen = 0;
719
+
720
+ if (byte_length_ <= 0) {
721
+ return false; // No more text to be found
722
+ }
723
+
724
+ // Go over alternating spans of text and tags,
725
+ // copying letters to buffer with single spaces for each run of non-letters
726
+ bool last_byte_was_space = false;
727
+ while (take < byte_length_) {
728
+ char c = next_byte_[take];
729
+ if (c == '\r') {c = LF;} // Canonical CR or LF
730
+ if (c == '\n') {c = LF;} // Canonical CR or LF
731
+
732
+ if (IsSpecial(c) && !is_plain_text_) {
733
+ if (c == '<') {
734
+ // Replace tag with space
735
+ c = ' '; // for almost-full test below
736
+ // or if <p> <br> <tr>, replace with \n
737
+ if (take < (byte_length_ - 3)) {
738
+ if (EqCase(next_byte_[take + 1], 'p') &&
739
+ NeqLetter(next_byte_[take + 2])) {
740
+ c = LF;
741
+ }
742
+ if (EqCase(next_byte_[take + 1], 'b') &&
743
+ EqCase(next_byte_[take + 2], 'r') &&
744
+ NeqLetter(next_byte_[take + 3])) {
745
+ c = LF;
746
+ }
747
+ if (EqCase(next_byte_[take + 1], 't') &&
748
+ EqCase(next_byte_[take + 2], 'r') &&
749
+ NeqLetter(next_byte_[take + 3])) {
750
+ c = LF;
751
+ }
752
+ }
753
+ // Begining of tag; skip to end and go around again
754
+ tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
755
+ exit_state_);
756
+ // Copy one byte, compressing spaces
757
+ if (!last_byte_was_space || !WS(c)) {
758
+ script_buffer_[put++] = c; // Advance dest
759
+ last_byte_was_space = WS(c);
760
+ }
761
+ } else if (c == '>') {
762
+ // Unexpected end of tag; copy it and go around again
763
+ tlen = 1; // Over the >
764
+ script_buffer_[put++] = c; // Advance dest
765
+ } else if (c == '&') {
766
+ // Expand entity, no advance
767
+ EntityToBuffer(next_byte_ + take, byte_length_ - take,
768
+ script_buffer_ + put, &tlen, &plen);
769
+ put += plen; // Advance dest
770
+ }
771
+ take += tlen; // Advance source
772
+ } else {
773
+ // Copy one byte, compressing spaces
774
+ if (!last_byte_was_space || !WS(c)) {
775
+ script_buffer_[put++] = c; // Advance dest
776
+ last_byte_was_space = WS(c);
777
+ }
778
+ ++take; // Advance source
779
+ }
780
+
781
+ if (WS(c) &&
782
+ (put >= put_soft_limit)) {
783
+ // Buffer is almost full
784
+ span->truncated = true;
785
+ break;
786
+ }
787
+ if (put >= kMaxScriptBytes) {
788
+ // Buffer is completely full
789
+ span->truncated = true;
790
+ break;
791
+ }
792
+ }
793
+
794
+ // Almost done. Back up to a character boundary if needed
795
+ while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) {
796
+ // Back up over continuation byte
797
+ --take;
798
+ --put;
799
+ }
800
+
801
+ // Update input position
802
+ next_byte_ += take;
803
+ byte_length_ -= take;
804
+
805
+ // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
806
+ // kMaxScriptBytes | | put
807
+ script_buffer_[put + 0] = ' ';
808
+ script_buffer_[put + 1] = ' ';
809
+ script_buffer_[put + 2] = ' ';
810
+ script_buffer_[put + 3] = '\0';
811
+
812
+ span->text_bytes = put; // Does not include the last four chars above
813
+ return true;
814
+ }
815
+
816
+
817
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
818
+ // Buffer ALWAYS has leading space and trailing space space space NUL
819
+ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
820
+ if (!letters_marks_only_) {
821
+ // Return non-tag text, including punctuation and digits
822
+ return GetOneTextSpan(span);
823
+ }
824
+
825
+ span->text = script_buffer_;
826
+ span->text_bytes = 0;
827
+ span->offset = next_byte_ - start_byte_;
828
+ span->ulscript = UNKNOWN_ULSCRIPT;
829
+ span->truncated = false;
830
+
831
+ // struct timeval script_start, script_mid, script_end;
832
+
833
+ int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
834
+ if ((kMaxScriptBytes <= byte_length_) &&
835
+ (byte_length_ < (2 * kMaxScriptBytes))) {
836
+ // Try to split the last two fragments in half
837
+ put_soft_limit = byte_length_ / 2;
838
+ }
839
+
840
+
841
+ int spanscript; // The script of this span
842
+ int sc = UNKNOWN_ULSCRIPT; // The script of next character
843
+ int tlen = 0;
844
+ int plen = 0;
845
+
846
+ script_buffer_[0] = ' '; // Always a space at front of output
847
+ script_buffer_[1] = '\0';
848
+ int take = 0;
849
+ int put = 1; // Start after the initial space
850
+
851
+ // Build offsets from span->text back to start_byte_ + span->offset
852
+ // This mapping reflects deletion of non-letters, expansion of
853
+ // entities, etc.
854
+ map2original_.Clear();
855
+ map2original_.Delete(span->offset); // So that MapBack(0) gives offset
856
+
857
+ // Get to the first real non-tag letter or entity that is a letter
858
+ int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
859
+ next_byte_ += skip;
860
+ byte_length_ -= skip;
861
+
862
+ if (skip != 1) {
863
+ map2original_.Delete(skip);
864
+ map2original_.Insert(1);
865
+ } else {
866
+ map2original_.Copy(1);
867
+ }
868
+ if (byte_length_ <= 0) {
869
+ map2original_.Reset();
870
+ return false; // No more letters to be found
871
+ }
872
+
873
+ // There is at least one letter, so we know the script for this span
874
+ span->ulscript = (ULScript)spanscript;
875
+
876
+
877
+ // Go over alternating spans of same-script letters and non-letters,
878
+ // copying letters to buffer with single spaces for each run of non-letters
879
+ while (take < byte_length_) {
880
+ // Copy run of letters in same script (&LS | LS)*
881
+ int letter_count = 0; // Keep track of word length
882
+ bool need_break = false;
883
+
884
+ while (take < byte_length_) {
885
+ // We are at a letter, nonletter, tag, or entity
886
+ if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
887
+ if (next_byte_[take] == '<') {
888
+ // Begining of tag
889
+ sc = 0;
890
+ break;
891
+ } else if (next_byte_[take] == '>') {
892
+ // Unexpected end of tag
893
+ sc = 0;
894
+ break;
895
+ } else if (next_byte_[take] == '&') {
896
+ // Copy entity, no advance
897
+ EntityToBuffer(next_byte_ + take, byte_length_ - take,
898
+ script_buffer_ + put, &tlen, &plen);
899
+ if (plen > 0) {
900
+ sc = GetUTF8LetterScriptNum(script_buffer_ + put);
901
+ }
902
+ }
903
+ } else {
904
+ // Real letter, safely copy up to 4 bytes, increment by 1..4
905
+ // Will update by 1..4 bytes at Advance, below
906
+ tlen = plen = UTF8OneCharLen(next_byte_ + take);
907
+ if (take < (byte_length_ - 3)) {
908
+ // X86 fast case, does unaligned load/store
909
+ UNALIGNED_STORE32(script_buffer_ + put,
910
+ UNALIGNED_LOAD32(next_byte_ + take));
911
+
912
+ } else {
913
+ // Slow case, happens 1-3 times per input document
914
+ memcpy(script_buffer_ + put, next_byte_ + take, plen);
915
+ }
916
+ sc = GetUTF8LetterScriptNum(next_byte_ + take);
917
+ }
918
+
919
+ // Allow continue across a single letter in a different script:
920
+ // A B D = three scripts, c = common script, i = inherited script,
921
+ // - = don't care, ( = take position before the += below
922
+ // AAA(A- continue
923
+ //
924
+ // AAA(BA continue
925
+ // AAA(BB break
926
+ // AAA(Bc continue (breaks after B)
927
+ // AAA(BD break
928
+ // AAA(Bi break
929
+ //
930
+ // AAA(c- break
931
+ //
932
+ // AAA(i- continue
933
+ //
934
+
935
+ if ((sc != spanscript) && (sc != ULScript_Inherited)) {
936
+ // Might need to break this script span
937
+ if (sc == ULScript_Common) {
938
+ need_break = true;
939
+ } else {
940
+ // Look at next following character, ignoring entity as Common
941
+ int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen);
942
+ if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
943
+ // We found a non-trivial change of script
944
+ if (one_script_only_) {
945
+ need_break = true;
946
+ }
947
+ }
948
+ }
949
+ }
950
+ if (need_break) {break;} // Non-letter or letter in wrong script
951
+
952
+ take += tlen; // Advance
953
+ put += plen; // Advance
954
+
955
+ // Update the offset map to reflect take/put lengths
956
+ if (tlen == plen) {
957
+ map2original_.Copy(tlen);
958
+ } else if (tlen < plen) {
959
+ map2original_.Copy(tlen);
960
+ map2original_.Insert(plen - tlen);
961
+ } else { // plen < tlen
962
+ map2original_.Copy(plen);
963
+ map2original_.Delete(tlen - plen);
964
+ }
965
+
966
+ ++letter_count;
967
+ if (put >= kMaxScriptBytes) {
968
+ // Buffer is full
969
+ span->truncated = true;
970
+ break;
971
+ }
972
+ } // End while letters
973
+
974
+ // Do run of non-letters (tag | &NL | NL)*
975
+ while (take < byte_length_) {
976
+ // Do fast scan to next interesting byte
977
+ tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
978
+ take += tlen;
979
+ map2original_.Delete(tlen);
980
+ if (take >= byte_length_) {break;} // Might have scanned to end
981
+
982
+ // We are at a letter, nonletter, tag, or entity
983
+ if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
984
+ if (next_byte_[take] == '<') {
985
+ // Begining of tag; skip to end and go around again
986
+ tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
987
+ exit_state_);
988
+ sc = 0;
989
+ } else if (next_byte_[take] == '>') {
990
+ // Unexpected end of tag; skip it and go around again
991
+ tlen = 1; // Over the >
992
+ sc = 0;
993
+ } else if (next_byte_[take] == '&') {
994
+ // Expand entity, no advance
995
+ EntityToBuffer(next_byte_ + take, byte_length_ - take,
996
+ script_buffer_ + put, &tlen, &plen);
997
+ if (plen > 0) {
998
+ sc = GetUTF8LetterScriptNum(script_buffer_ + put);
999
+ }
1000
+ }
1001
+ } else {
1002
+ // Update 1..4
1003
+ tlen = UTF8OneCharLen(next_byte_ + take);
1004
+ sc = GetUTF8LetterScriptNum(next_byte_ + take);
1005
+ }
1006
+ if (sc != 0) {break;} // Letter found
1007
+ take += tlen; // Else advance
1008
+ map2original_.Delete(tlen);
1009
+ } // End while not-letters
1010
+
1011
+ script_buffer_[put++] = ' ';
1012
+ map2original_.Insert(1);
1013
+
1014
+ // Letter in wrong script ?
1015
+ if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;}
1016
+ if (put >= put_soft_limit) {
1017
+ // Buffer is almost full
1018
+ span->truncated = true;
1019
+ break;
1020
+ }
1021
+ }
1022
+
1023
+ // Almost done. Back up to a character boundary if needed
1024
+ while ((0 < take) && (take < byte_length_) &&
1025
+ ((next_byte_[take] & 0xc0) == 0x80)) {
1026
+ // Back up over continuation byte
1027
+ --take;
1028
+ --put;
1029
+ }
1030
+
1031
+ // Update input position
1032
+ next_byte_ += take;
1033
+ byte_length_ -= take;
1034
+
1035
+ // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
1036
+ // kMaxScriptBytes | | put
1037
+ script_buffer_[put + 0] = ' ';
1038
+ script_buffer_[put + 1] = ' ';
1039
+ script_buffer_[put + 2] = ' ';
1040
+ script_buffer_[put + 3] = '\0';
1041
+ map2original_.Insert(4);
1042
+ map2original_.Reset();
1043
+
1044
+ span->text_bytes = put; // Does not include the last four chars above
1045
+ return true;
1046
+ }
1047
+
1048
+ // Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase
1049
+ // List changes with each version of Unicode, so just always lowercase
1050
+ // Unicode 6.2.0:
1051
+ // ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN
1052
+ void ScriptScanner::LowerScriptSpan(LangSpan* span) {
1053
+ // If needed, lowercase all the text. If we do it sooner, might miss
1054
+ // lowercasing an entity such as &Aacute;
1055
+ // We only need to do this for Latn and Cyrl scripts
1056
+ map2uplow_.Clear();
1057
+ // Full Unicode lowercase of the entire buffer, including
1058
+ // four pad bytes off the end.
1059
+ // Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad
1060
+ // bytes and put the 0x00 in explicitly.
1061
+ // Build an offset map from script_buffer_lower_ back to script_buffer_
1062
+ int consumed, filled, changed;
1063
+ StringPiece istr(span->text, span->text_bytes + 3);
1064
+ StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer);
1065
+
1066
+ UTF8GenericReplace(&utf8repl_lettermarklower_obj,
1067
+ istr, ostr, is_plain_text_,
1068
+ &consumed, &filled, &changed, &map2uplow_);
1069
+ script_buffer_lower_[filled] = '\0';
1070
+ span->text = script_buffer_lower_;
1071
+ span->text_bytes = filled - 3;
1072
+ map2uplow_.Reset();
1073
+ }
1074
+
1075
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
1076
+ // Force Latin, Cyrillic, Greek scripts to be lowercase
1077
+ // Buffer ALWAYS has leading space and trailing space space space NUL
1078
+ bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) {
1079
+ bool ok = GetOneScriptSpan(span);
1080
+ if (ok) {
1081
+ LowerScriptSpan(span);
1082
+ }
1083
+ return ok;
1084
+ }
1085
+
1086
+ // Maps byte offset in most recent GetOneScriptSpan/Lower
1087
+ // span->text [0..text_bytes] into an additional byte offset from
1088
+ // span->offset, to get back to corresponding text in the original
1089
+ // input buffer.
1090
+ // text_offset must be the first byte
1091
+ // of a UTF-8 character, or just beyond the last character. Normally this
1092
+ // routine is called with the first byte of an interesting range and
1093
+ // again with the first byte of the following range.
1094
+ int ScriptScanner::MapBack(int text_offset) {
1095
+ return map2original_.MapBack(map2uplow_.MapBack(text_offset));
1096
+ }
1097
+
1098
+
1099
+ // Gets lscript number for letters; always returns
1100
+ // 0 (common script) for non-letters
1101
+ int GetUTF8LetterScriptNum(const char* src) {
1102
+ int srclen = UTF8OneCharLen(src);
1103
+ const uint8* usrc = reinterpret_cast<const uint8*>(src);
1104
+ return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj,
1105
+ &usrc, &srclen);
1106
+ }
1107
+
1108
+ } // namespace CLD2
1109
+ } // namespace chrome_lang_id