cld3 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +18 -0
  3. data/LICENSE +204 -0
  4. data/LICENSE_CLD3 +203 -0
  5. data/README.md +22 -0
  6. data/cld3.gemspec +35 -0
  7. data/ext/cld3/base.cc +36 -0
  8. data/ext/cld3/base.h +106 -0
  9. data/ext/cld3/casts.h +98 -0
  10. data/ext/cld3/embedding_feature_extractor.cc +51 -0
  11. data/ext/cld3/embedding_feature_extractor.h +182 -0
  12. data/ext/cld3/embedding_network.cc +196 -0
  13. data/ext/cld3/embedding_network.h +186 -0
  14. data/ext/cld3/embedding_network_params.h +285 -0
  15. data/ext/cld3/extconf.rb +49 -0
  16. data/ext/cld3/feature_extractor.cc +137 -0
  17. data/ext/cld3/feature_extractor.h +633 -0
  18. data/ext/cld3/feature_extractor.proto +50 -0
  19. data/ext/cld3/feature_types.cc +72 -0
  20. data/ext/cld3/feature_types.h +158 -0
  21. data/ext/cld3/fixunicodevalue.cc +55 -0
  22. data/ext/cld3/fixunicodevalue.h +69 -0
  23. data/ext/cld3/float16.h +58 -0
  24. data/ext/cld3/fml_parser.cc +308 -0
  25. data/ext/cld3/fml_parser.h +123 -0
  26. data/ext/cld3/generated_entities.cc +296 -0
  27. data/ext/cld3/generated_ulscript.cc +678 -0
  28. data/ext/cld3/generated_ulscript.h +142 -0
  29. data/ext/cld3/getonescriptspan.cc +1109 -0
  30. data/ext/cld3/getonescriptspan.h +124 -0
  31. data/ext/cld3/integral_types.h +37 -0
  32. data/ext/cld3/lang_id_nn_params.cc +57449 -0
  33. data/ext/cld3/lang_id_nn_params.h +178 -0
  34. data/ext/cld3/language_identifier_features.cc +165 -0
  35. data/ext/cld3/language_identifier_features.h +116 -0
  36. data/ext/cld3/nnet_language_identifier.cc +380 -0
  37. data/ext/cld3/nnet_language_identifier.h +175 -0
  38. data/ext/cld3/nnet_language_identifier_c.cc +72 -0
  39. data/ext/cld3/offsetmap.cc +478 -0
  40. data/ext/cld3/offsetmap.h +168 -0
  41. data/ext/cld3/port.h +143 -0
  42. data/ext/cld3/registry.cc +28 -0
  43. data/ext/cld3/registry.h +242 -0
  44. data/ext/cld3/relevant_script_feature.cc +89 -0
  45. data/ext/cld3/relevant_script_feature.h +49 -0
  46. data/ext/cld3/script_detector.h +156 -0
  47. data/ext/cld3/sentence.proto +77 -0
  48. data/ext/cld3/sentence_features.cc +29 -0
  49. data/ext/cld3/sentence_features.h +35 -0
  50. data/ext/cld3/simple_adder.h +72 -0
  51. data/ext/cld3/stringpiece.h +81 -0
  52. data/ext/cld3/task_context.cc +161 -0
  53. data/ext/cld3/task_context.h +81 -0
  54. data/ext/cld3/task_context_params.cc +74 -0
  55. data/ext/cld3/task_context_params.h +54 -0
  56. data/ext/cld3/task_spec.proto +98 -0
  57. data/ext/cld3/text_processing.cc +245 -0
  58. data/ext/cld3/text_processing.h +30 -0
  59. data/ext/cld3/unicodetext.cc +96 -0
  60. data/ext/cld3/unicodetext.h +144 -0
  61. data/ext/cld3/utf8acceptinterchange.h +486 -0
  62. data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
  63. data/ext/cld3/utf8repl_lettermarklower.h +758 -0
  64. data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
  65. data/ext/cld3/utf8statetable.cc +1344 -0
  66. data/ext/cld3/utf8statetable.h +285 -0
  67. data/ext/cld3/utils.cc +241 -0
  68. data/ext/cld3/utils.h +144 -0
  69. data/ext/cld3/workspace.cc +64 -0
  70. data/ext/cld3/workspace.h +177 -0
  71. data/lib/cld3.rb +99 -0
  72. metadata +158 -0
@@ -0,0 +1,142 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ // generated_ulscript.h
16
+ // Machine generated. Do Not Edit.
17
+ //
18
+ // Declarations for scripts recognized by CLD2
19
+ //
20
+
21
+ #ifndef SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
22
+ #define SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
23
+
24
+ namespace chrome_lang_id {
25
+ namespace CLD2 {
26
+
27
+ typedef enum {RTypeNone = 0, RTypeOne, RTypeMany, RTypeCJK} ULScriptRType; // per-script class: none / one / many / CJK (the classes themselves are not explained in this header)
28
+
29
+ typedef struct {const char* s; int i;} CharIntPair; // string key s paired with an integer value i
30
+
31
+ typedef enum { // script ids; the trailing comment on each line is that script's four-letter code
32
+ ULScript_Common = 0, // Zyyy
33
+ ULScript_Latin = 1, // Latn
34
+ ULScript_Greek = 2, // Grek
35
+ ULScript_Cyrillic = 3, // Cyrl
36
+ ULScript_Armenian = 4, // Armn
37
+ ULScript_Hebrew = 5, // Hebr
38
+ ULScript_Arabic = 6, // Arab
39
+ ULScript_Syriac = 7, // Syrc
40
+ ULScript_Thaana = 8, // Thaa
41
+ ULScript_Devanagari = 9, // Deva
42
+ ULScript_Bengali = 10, // Beng
43
+ ULScript_Gurmukhi = 11, // Guru
44
+ ULScript_Gujarati = 12, // Gujr
45
+ ULScript_Oriya = 13, // Orya
46
+ ULScript_Tamil = 14, // Taml
47
+ ULScript_Telugu = 15, // Telu
48
+ ULScript_Kannada = 16, // Knda
49
+ ULScript_Malayalam = 17, // Mlym
50
+ ULScript_Sinhala = 18, // Sinh
51
+ ULScript_Thai = 19, // Thai
52
+ ULScript_Lao = 20, // Laoo
53
+ ULScript_Tibetan = 21, // Tibt
54
+ ULScript_Myanmar = 22, // Mymr
55
+ ULScript_Georgian = 23, // Geor
56
+ ULScript_Hani = 24, // Hani
57
+ ULScript_Ethiopic = 25, // Ethi
58
+ ULScript_Cherokee = 26, // Cher
59
+ ULScript_Canadian_Aboriginal = 27, // Cans
60
+ ULScript_Ogham = 28, // Ogam
61
+ ULScript_Runic = 29, // Runr
62
+ ULScript_Khmer = 30, // Khmr
63
+ ULScript_Mongolian = 31, // Mong
64
+ ULScript_32 = 32, // placeholder: no script code is listed for this value
65
+ ULScript_33 = 33, // placeholder: no script code is listed for this value
66
+ ULScript_Bopomofo = 34, // Bopo
67
+ ULScript_35 = 35, // placeholder: no script code is listed for this value
68
+ ULScript_Yi = 36, // Yiii
69
+ ULScript_Old_Italic = 37, // Ital
70
+ ULScript_Gothic = 38, // Goth
71
+ ULScript_Deseret = 39, // Dsrt
72
+ ULScript_Inherited = 40, // Zinh
73
+ ULScript_Tagalog = 41, // Tglg
74
+ ULScript_Hanunoo = 42, // Hano
75
+ ULScript_Buhid = 43, // Buhd
76
+ ULScript_Tagbanwa = 44, // Tagb
77
+ ULScript_Limbu = 45, // Limb
78
+ ULScript_Tai_Le = 46, // Tale
79
+ ULScript_Linear_B = 47, // Linb
80
+ ULScript_Ugaritic = 48, // Ugar
81
+ ULScript_Shavian = 49, // Shaw
82
+ ULScript_Osmanya = 50, // Osma
83
+ ULScript_Cypriot = 51, // Cprt
84
+ ULScript_Braille = 52, // Brai
85
+ ULScript_Buginese = 53, // Bugi
86
+ ULScript_Coptic = 54, // Copt
87
+ ULScript_New_Tai_Lue = 55, // Talu
88
+ ULScript_Glagolitic = 56, // Glag
89
+ ULScript_Tifinagh = 57, // Tfng
90
+ ULScript_Syloti_Nagri = 58, // Sylo
91
+ ULScript_Old_Persian = 59, // Xpeo
92
+ ULScript_Kharoshthi = 60, // Khar
93
+ ULScript_Balinese = 61, // Bali
94
+ ULScript_Cuneiform = 62, // Xsux
95
+ ULScript_Phoenician = 63, // Phnx
96
+ ULScript_Phags_Pa = 64, // Phag
97
+ ULScript_Nko = 65, // Nkoo
98
+ ULScript_Sundanese = 66, // Sund
99
+ ULScript_Lepcha = 67, // Lepc
100
+ ULScript_Ol_Chiki = 68, // Olck
101
+ ULScript_Vai = 69, // Vaii
102
+ ULScript_Saurashtra = 70, // Saur
103
+ ULScript_Kayah_Li = 71, // Kali
104
+ ULScript_Rejang = 72, // Rjng
105
+ ULScript_Lycian = 73, // Lyci
106
+ ULScript_Carian = 74, // Cari
107
+ ULScript_Lydian = 75, // Lydi
108
+ ULScript_Cham = 76, // Cham
109
+ ULScript_Tai_Tham = 77, // Lana
110
+ ULScript_Tai_Viet = 78, // Tavt
111
+ ULScript_Avestan = 79, // Avst
112
+ ULScript_Egyptian_Hieroglyphs = 80, // Egyp
113
+ ULScript_Samaritan = 81, // Samr
114
+ ULScript_Lisu = 82, // Lisu
115
+ ULScript_Bamum = 83, // Bamu
116
+ ULScript_Javanese = 84, // Java
117
+ ULScript_Meetei_Mayek = 85, // Mtei
118
+ ULScript_Imperial_Aramaic = 86, // Armi
119
+ ULScript_Old_South_Arabian = 87, // Sarb
120
+ ULScript_Inscriptional_Parthian = 88, // Prti
121
+ ULScript_Inscriptional_Pahlavi = 89, // Phli
122
+ ULScript_Old_Turkic = 90, // Orkh
123
+ ULScript_Kaithi = 91, // Kthi
124
+ ULScript_Batak = 92, // Batk
125
+ ULScript_Brahmi = 93, // Brah
126
+ ULScript_Mandaic = 94, // Mand
127
+ ULScript_Chakma = 95, // Cakm
128
+ ULScript_Meroitic_Cursive = 96, // Merc
129
+ ULScript_Meroitic_Hieroglyphs = 97, // Mero
130
+ ULScript_Miao = 98, // Plrd
131
+ ULScript_Sharada = 99, // Shrd
132
+ ULScript_Sora_Sompeng = 100, // Sora
133
+ ULScript_Takri = 101, // Takr
134
+ NUM_ULSCRIPTS // = 102, one past the last script value above; must stay last
135
+ } ULScript;
136
+
137
+ #define UNKNOWN_ULSCRIPT ULScript_Common // "unknown" shares value 0 with ULScript_Common
138
+
139
+ } // namespace CLD2
140
+ } // namespace chrome_lang_id
141
+
142
+ #endif // SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
@@ -0,0 +1,1109 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ //
16
+ // Author: dsites@google.com (Dick Sites)
17
+ //
18
+
19
+
20
+ #include "getonescriptspan.h"
21
+
22
+ #include <string.h>
23
+
24
+ #include "fixunicodevalue.h"
25
+ #include "port.h"
26
+ #include "utf8acceptinterchange.h"
27
+ #include "utf8repl_lettermarklower.h"
28
+ #include "utf8prop_lettermarkscriptnum.h"
29
+ #include "utf8scannot_lettermarkspecial.h"
30
+ #include "utf8statetable.h"
31
+
32
+ namespace chrome_lang_id {
33
+ namespace CLD2 {
34
+
35
+ // Alphabetical order for binary search, from
36
+ // generated_entities.cc
37
+ extern const int kNameToEntitySize;
38
+ extern const CharIntPair kNameToEntity[];
39
+
40
// Lookup table indexed by byte value: 1 for the three HTML-special
// characters '&' (0x26), '<' (0x3C) and '>' (0x3E), 0 for every other byte.
// Only bytes 0x00..0x3F are spelled out; the remaining 192 entries are
// zero-filled by aggregate initialization.
static const char kSpecialSymbol[256] = {
  // 0x00 - 0x1F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0x20 - 0x2F: '&' is at 0x26
  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0,
  // 0x30 - 0x3F: '<' is at 0x3C, '>' is at 0x3E
  0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
};
51
+
52
+
53
+
54
+ #define LT 0 // <
55
+ #define GT 1 // >
56
+ #define EX 2 // !
57
+ #define HY 3 // -
58
+ #define QU 4 // "
59
+ #define AP 5 // '
60
+ #define SL 6 // /
61
+ #define S_ 7 // S or s
62
+ #define C_ 8 // C or c
63
+ #define R_ 9 // R or r
64
+ #define I_ 10 // I or i
65
+ #define P_ 11 // P or p
66
+ #define T_ 12 // T or t
67
+ #define Y_ 13 // Y or y
68
+ #define L_ 14 // L or l
69
+ #define E_ 15 // E or e
70
+ #define CR 16 // <cr> or <lf>
71
+ #define NL 17 // non-letter: ASCII whitespace, digit, punctuation; also bytes 0x80-0xBF
72
+ #define PL 18 // possible letter, incl. & and bytes 0xC0-0xFF
73
+ #define xx 19 // <unused>
74
+
75
+ // Map each byte value to one of the 20 categories above; the category is the column index into kTagParseTbl_0
76
+ static const uint8 kCharToSub[256] = {
77
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL, // 0x00-0x0F
78
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, // 0x10-0x1F
79
+ NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL, // 0x20-0x2F
80
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL, // 0x30-0x3F
81
+
82
+ PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, // 0x40-0x4F
83
+ P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, // 0x50-0x5F
84
+ PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, // 0x60-0x6F
85
+ P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, // 0x70-0x7F
86
+
87
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, // 0x80-0x8F
88
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, // 0x90-0x9F
89
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, // 0xA0-0xAF
90
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, // 0xB0-0xBF
91
+
92
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, // 0xC0-0xCF
93
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, // 0xD0-0xDF
94
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, // 0xE0-0xEF
95
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, // 0xF0-0xFF
96
+ };
97
+
98
+ #undef LT
99
+ #undef GT
100
+ #undef EX
101
+ #undef HY
102
+ #undef QU
103
+ #undef AP
104
+ #undef SL
105
+ #undef S_
106
+ #undef C_
107
+ #undef R_
108
+ #undef I_
109
+ #undef P_
110
+ #undef T_
111
+ #undef Y_
112
+ #undef L_
113
+ #undef E_
114
+ #undef CR
115
+ #undef NL
116
+ #undef PL
117
+ #undef xx
118
+
119
+
120
+ #define OK 0 // table entry: go to state [0], the OK exit state
121
+ #define X_ 1 // table entry: go to state [1], the error exit state
122
+
123
+
124
+ static const int kMaxExitStateLettersMarksOnly = 1; // scanning stops on reaching state 0 or 1
125
+ static const int kMaxExitStateAllText = 2; // scanning stops on reaching state 0, 1 or 2
126
+
127
+
128
+ // State machine to do cheap parse of non-letter strings incl. tags
129
+ // advances <tag>
130
+ // | |
131
+ // advances <tag> ... </tag> for <script> <style>
132
+ // | |
133
+ // advances <!-- ... <tag> ... -->
134
+ // | |
135
+ // advances <tag
136
+ // || (0)
137
+ // advances <tag <tag2>
138
+ // || (0)
139
+ // Layout: one row of 20 entries per state, one column per kCharToSub category; each entry is the next state.
140
+ // We start in state [0] at a non-letter and make at least one transition
141
+ // When scanning for just letters, arriving back at state [0] or [1] exits
142
+ // the state machine.
143
+ // When scanning for any non-tag text, arriving at state [2] also exits
144
+ static const uint8 kTagParseTbl_0[] = {
145
+ // < > ! - " ' / S C R I P T Y L E CR NL PL xx
146
+ 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK exit state
147
+ X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state
148
+ 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL* [exit state]
149
+ X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] <
150
+ X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <!
151
+ X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!-
152
+ 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.*
153
+ 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*-
154
+ 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*--
155
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.*
156
+ 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
157
+ 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
158
+ X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
159
+
160
+ // < > ! - " ' / S C R I P T Y L E CR NL PL xx
161
+ X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S
162
+ X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC
163
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR
164
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI
165
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP
166
+ X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
167
+ 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
168
+ 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
169
+ 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF
170
+ 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
171
+ 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
172
+ 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
173
+ 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
174
+ 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
175
+ 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
176
+
177
+ // < > ! - " ' / S C R I P T Y L E CR NL PL xx
178
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST
179
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY
180
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL
181
+ X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
182
+ 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
183
+ 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
184
+ 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF
185
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
186
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
187
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
188
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
189
+ 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
190
+ };
191
+
192
+ #undef OK
193
+ #undef X_
194
+
195
+ enum
196
+ {
197
+ UTFmax = 4, // maximum bytes per rune
198
+ Runesync = 0x80, // byte values below this cannot be part of a multi-byte UTF sequence
199
+ Runeself = 0x80, // below this value a rune and its UTF encoding are the same single byte
200
+ Runeerror = 0xFFFD, // replacement value; runetochar encodes this for runes above Runemax
201
+ Runemax = 0x10FFFF, // maximum rune value
202
+ };
203
+
204
+ // Debugging. Not thread safe.
205
+ static char gDisplayPiece[32];
206
+ const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4};
207
+ char* DisplayPiece(const char* next_byte_, int byte_length_) {
208
+ // Copy up to 8 UTF-8 chars to buffer
209
+ int k = 0; // byte count
210
+ int n = 0; // character count
211
+ for (int i = 0; i < byte_length_; ++i) {
212
+ char c = next_byte_[i];
213
+ if ((c & 0xc0) != 0x80) {
214
+ // Beginning of a UTF-8 character
215
+ int charlen = gCharlen[static_cast<uint8>(c) >> 4];
216
+ if (i + charlen > byte_length_) {break;} // Not enough room for full char
217
+ if (k >= (32 - 7)) {break;} // Not necessarily enough room
218
+ if (n >= 8) {break;} // Enough characters already
219
+ ++n;
220
+ }
221
+ if (c == '<') {
222
+ memcpy(&gDisplayPiece[k], "&lt;", 4); k += 4;
223
+ } else if (c == '>') {
224
+ memcpy(&gDisplayPiece[k], "&gt;", 4); k += 4;
225
+ } else if (c == '&') {
226
+ memcpy(&gDisplayPiece[k], "&amp;", 5); k += 5;
227
+ } else if (c == '\'') {
228
+ memcpy(&gDisplayPiece[k], "&apos;", 6); k += 6;
229
+ } else if (c == '"') {
230
+ memcpy(&gDisplayPiece[k], "&quot;", 6); k += 6;
231
+ } else {
232
+ gDisplayPiece[k++] = c;
233
+ }
234
+ }
235
+ gDisplayPiece[k++] = '\0';
236
+ return gDisplayPiece;
237
+ }
238
+
239
+
240
+
241
+ // runetochar copies (encodes) one rune, pointed to by r, to at most
242
+ // UTFmax bytes starting at s and returns the number of bytes generated.
243
+ int runetochar(char *str, const char32 *rune) {
244
+ // Convert to unsigned for range check.
245
+ unsigned long c;
246
+
247
+ // 1 char 00-7F
248
+ c = *rune;
249
+ if(c <= 0x7F) {
250
+ str[0] = static_cast<char>(c);
251
+ return 1;
252
+ }
253
+
254
+ // 2 char 0080-07FF
255
+ if(c <= 0x07FF) {
256
+ str[0] = 0xC0 | static_cast<char>(c >> 1*6);
257
+ str[1] = 0x80 | (c & 0x3F);
258
+ return 2;
259
+ }
260
+
261
+ // Range check
262
+ if (c > Runemax) {
263
+ c = Runeerror;
264
+ }
265
+
266
+ // 3 char 0800-FFFF
267
+ if (c <= 0xFFFF) {
268
+ str[0] = 0xE0 | static_cast<char>(c >> 2*6);
269
+ str[1] = 0x80 | ((c >> 1*6) & 0x3F);
270
+ str[2] = 0x80 | (c & 0x3F);
271
+ return 3;
272
+ }
273
+
274
+ // 4 char 10000-1FFFFF
275
+ str[0] = 0xF0 | static_cast<char>(c >> 3*6);
276
+ str[1] = 0x80 | ((c >> 2*6) & 0x3F);
277
+ str[2] = 0x80 | ((c >> 1*6) & 0x3F);
278
+ str[3] = 0x80 | (c & 0x3F);
279
+ return 4;
280
+ }
281
+
282
+
283
+
284
+ // Useful for converting an entity to an ascii value.
285
+ // RETURNS unicode value, or -1 if entity isn't valid. Don't include & or ;
286
+ int LookupEntity(const char* entity_name, int entity_len) {
287
+ // Make a C string
288
+ if (entity_len >= 16) {return -1;} // All real entities are shorter
289
+ char temp[16];
290
+ memcpy(temp, entity_name, entity_len);
291
+ temp[entity_len] = '\0';
292
+ int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity);
293
+ if (match >= 0) {return kNameToEntity[match].i;}
294
+ return -1;
295
+ }
296
+
297
// True for the ASCII decimal digits '0'..'9' only.
bool ascii_isdigit(char c) {
  const bool below_zero = c < '0';
  const bool above_nine = c > '9';
  return !(below_zero || above_nine);
}
300
// True for ASCII hexadecimal digits: '0'..'9', 'a'..'f', 'A'..'F'.
bool ascii_isxdigit(char c) {
  const bool decimal = (c >= '0') && (c <= '9');
  const bool lower_hex = (c >= 'a') && (c <= 'f');
  const bool upper_hex = (c >= 'A') && (c <= 'F');
  return decimal || lower_hex || upper_hex;
}
306
// True for ASCII letters and decimal digits: '0'..'9', 'a'..'z', 'A'..'Z'.
bool ascii_isalnum(char c) {
  const bool digit = (c >= '0') && (c <= '9');
  const bool lower = (c >= 'a') && (c <= 'z');
  const bool upper = (c >= 'A') && (c <= 'Z');
  return digit || lower || upper;
}
312
// Numeric value (0..15) of one ASCII hexadecimal digit.
// Any character that is not a hex digit yields 0.
int hex_digit_to_int(char c) {
  int value = 0;
  if ((c >= '0') && (c <= '9')) {
    value = c - '0';
  } else if ((c >= 'a') && (c <= 'f')) {
    value = 10 + (c - 'a');
  } else if ((c >= 'A') && (c <= 'F')) {
    value = 10 + (c - 'A');
  }
  return value;
}
318
+
319
+ static int32 strto32_base10(const char* nptr, const char* limit,
320
+ const char **endptr) {
321
+ *endptr = nptr;
322
+ while (nptr < limit && *nptr == '0') {
323
+ ++nptr;
324
+ }
325
+ if (nptr == limit || !ascii_isdigit(*nptr))
326
+ return -1;
327
+ const char* end_digits_run = nptr;
328
+ while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) {
329
+ ++end_digits_run;
330
+ }
331
+ *endptr = end_digits_run;
332
+ const int num_digits = end_digits_run - nptr;
333
+ // kint32max == 2147483647.
334
+ if (num_digits < 9 ||
335
+ (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) {
336
+ int value = 0;
337
+ for (; nptr < end_digits_run; ++nptr) {
338
+ value *= 10;
339
+ value += *nptr - '0';
340
+ }
341
+ // Overflow past the last valid unicode codepoint
342
+ // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
343
+ return FixUnicodeValue(value);
344
+ } else {
345
+ // Overflow: can't fit in an int32;
346
+ // returns the replacement character 0xFFFD.
347
+ return 0xFFFD;
348
+ }
349
+ }
350
+
351
+ static int32 strto32_base16(const char* nptr, const char* limit,
352
+ const char **endptr) {
353
+ *endptr = nptr;
354
+ while (nptr < limit && *nptr == '0') {
355
+ ++nptr;
356
+ }
357
+ if (nptr == limit || !ascii_isxdigit(*nptr)) {
358
+ return -1;
359
+ }
360
+ const char* end_xdigits_run = nptr;
361
+ while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) {
362
+ ++end_xdigits_run;
363
+ }
364
+ *endptr = end_xdigits_run;
365
+ const int num_xdigits = end_xdigits_run - nptr;
366
+ // kint32max == 0x7FFFFFFF.
367
+ if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) {
368
+ int value = 0;
369
+ for (; nptr < end_xdigits_run; ++nptr) {
370
+ value <<= 4;
371
+ value += hex_digit_to_int(*nptr);
372
+ }
373
+ // Overflow past the last valid unicode codepoint
374
+ // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
375
+ return FixUnicodeValue(value);
376
+ } else {
377
+ // Overflow: can't fit in an int32;
378
+ // returns the replacement character 0xFFFD.
379
+ return 0xFFFD;
380
+ }
381
+ }
382
+
383
+ // Unescape the current character pointed to by src. SETS the number
384
+ // of chars read for the conversion (in UTF8). If src isn't a valid entity,
385
+ // just consume the & and RETURN -1. If src doesn't point to & -- which it
386
+ // should -- set src_consumed to 0 and RETURN -1.
387
+ int ReadEntity(const char* src, int srcn, int* src_consumed) {
388
+ const char* const srcend = src + srcn;
389
+
390
+ if (srcn == 0 || *src != '&') { // input should start with an ampersand
391
+ *src_consumed = 0;
392
+ return -1;
393
+ }
394
+ *src_consumed = 1; // we'll get the & at least
395
+
396
+ // The standards are a bit unclear on when an entity ends. Certainly a ";"
397
+ // ends one, but spaces probably do too. We follow the lead of both IE and
398
+ // Netscape, which as far as we can tell end numeric entities (1st case below)
399
+ // at any non-digit, and end character entities (2nd case) at any non-alnum.
400
+ const char* entstart, *entend; // where the entity starts and ends
401
+ entstart = src + 1; // read past the &
402
+ int entval; // UCS2 value of the entity
403
+ if ( *entstart == '#' ) { // -- 1st case: numeric entity
404
+ if ( entstart + 2 >= srcend ) {
405
+ return -1; // no way a legitimate number could fit
406
+ } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) { // hex numeric
407
+ entval = strto32_base16(entstart + 2, srcend, &entend);
408
+ } else { // decimal numeric entity
409
+ entval = strto32_base10(entstart+1, srcend, &entend);
410
+ }
411
+ if (entval == -1 || entend > srcend) {
412
+ return -1; // not entirely correct, but close enough
413
+ }
414
+ } else { // -- 2nd case: character entity
415
+ for (entend = entstart;
416
+ entend < srcend && ascii_isalnum(*entend);
417
+ ++entend ) {
418
+ // entity consists of alphanumeric chars
419
+ }
420
+ entval = LookupEntity(entstart, entend - entstart);
421
+ if (entval < 0) {
422
+ return -1; // not a legal entity name
423
+ }
424
+ // Now we do a strange-seeming IE6-compatibility check: if entval is
425
+ // >= 256, it *must* be followed by a semicolon or it's not considered
426
+ // an entity. The problem is lots of the newfangled entity names, like
427
+ // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en".
428
+ // When these links are written in HTML, it would be really bad if the
429
+ // "&lang" were treated as an entity, which is what the spec says
430
+ // *should* happen (even when the HTML is inside an "A HREF" tag!)
431
+ // IE ignores the spec for these new, high-value entities, so we do too.
432
+ if ( entval >= 256 && !(entend < srcend && *entend == ';') ) {
433
+ return -1; // make non-;-terminated entity illegal
434
+ }
435
+ }
436
+
437
+ // Finally, figure out how much src was consumed
438
+ if ( entend < srcend && *entend == ';' ) {
439
+ entend++; // standard says ; terminator is special
440
+ }
441
+ *src_consumed = entend - src;
442
+ return entval;
443
+ }
444
+
445
+
446
+ // Src points to '&'
447
+ // Writes entity value to dst. Returns take(src), put(dst) byte counts
448
+ void EntityToBuffer(const char* src, int len, char* dst,
449
+ int* tlen, int* plen) {
450
+ char32 entval = ReadEntity(src, len, tlen);
451
+
452
+ // ReadEntity does this already: entval = FixUnicodeValue(entval);
453
+
454
+ // Convert UTF-32 to UTF-8
455
+ if (entval > 0) {
456
+ *plen = runetochar(dst, &entval);
457
+ } else {
458
+ // Illegal entity; ignore the '&'
459
+ *tlen = 1;
460
+ *plen = 0;
461
+ }
462
+ }
463
+
464
+ // Returns true if character is < > or &, none of which are letters
465
+ bool inline IsSpecial(char c) {
466
+ // Comparison (int != 0) is used to silence the warning:
467
+ // 'const char': forcing value to bool
468
+ if ((c & 0xe0) == 0x20) {
469
+ return (kSpecialSymbol[static_cast<uint8>(c)] != 0);
470
+ }
471
+ return false;
472
+ }
473
+
474
+ // Quick Skip to next letter or < > & or to end of string (eos)
475
+ // Always return is_letter for eos
476
+ int ScanToLetterOrSpecial(const char* src, int len) {
477
+ int bytes_consumed;
478
+ StringPiece str(src, len);
479
+ UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed);
480
+ return bytes_consumed;
481
+ }
482
+
483
+
484
+
485
+
486
+ // src points to non-letter, such as tag-opening '<'
487
+ // Return length from here to next possible letter
488
+ // On another < before >, return 1
489
+ // advances <tag>
490
+ // | |
491
+ // advances <tag> ... </tag> for <script> <style>
492
+ // | |
493
+ // advances <!-- ... <tag> ... -->
494
+ // | |
495
+ // advances <tag
496
+ // | | end of string
497
+ // advances <tag <tag2>
498
+ // ||
499
+ int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) {
500
+ const uint8* src = reinterpret_cast<const uint8*>(isrc);
501
+ const uint8* srclimit = src + len;
502
+ const uint8* tagParseTbl = kTagParseTbl_0;
503
+ int e = 0;
504
+ while (src < srclimit) {
505
+ e = tagParseTbl[kCharToSub[*src++]];
506
+ if (e <= max_exit_state) {
507
+ // We overshot by one byte
508
+ --src;
509
+ break;
510
+ }
511
+ tagParseTbl = &kTagParseTbl_0[e * 20];
512
+ }
513
+
514
+ if (src >= srclimit) {
515
+ // We fell off the end of the text.
516
+ // It looks like the most common case for this is a truncated file, not
517
+ // mismatched angle brackets. So we pretend that the last char was '>'
518
+ return len;
519
+ }
520
+
521
+ // OK to be in state 0 or state 2 at exit
522
+ if ((e != 0) && (e != 2)) {
523
+ // Error, '<' followed by '<'
524
+ // We want to back up to first <, then advance by one byte past it
525
+ int offset = src - reinterpret_cast<const uint8*>(isrc);
526
+
527
+ // Backscan to first '<' and return enough length to just get past it
528
+ --offset; // back up over the second '<', which caused us to stop
529
+ while ((0 < offset) && (isrc[offset] != '<')) {
530
+ // Find the first '<', which is unmatched
531
+ --offset;
532
+ }
533
+ // skip to just beyond first '<'
534
+ return offset + 1;
535
+ }
536
+
537
+ return src - reinterpret_cast<const uint8*>(isrc);
538
+ }
539
+
540
+ // Returns mid if key found in lo <= mid < hi, else -1
541
+ int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) {
542
+ // binary search
543
+ while (lo < hi) {
544
+ int mid = (lo + hi) >> 1;
545
+ if (strcmp(key, cipair[mid].s) < 0) {
546
+ hi = mid;
547
+ } else if (strcmp(key, cipair[mid].s) > 0) {
548
+ lo = mid + 1;
549
+ } else {
550
+ return mid;
551
+ }
552
+ }
553
+ return -1;
554
+ }
555
+
556
+ // Returns the length in bytes of the prefix of src that is all
557
+ // interchange valid UTF-8
558
+ int SpanInterchangeValid(const char* src, int byte_length) {
559
+ int bytes_consumed;
560
+ const UTF8ReplaceObj* st = &utf8acceptinterchange_obj;
561
+ StringPiece str(src, byte_length);
562
+ UTF8GenericScan(st, str, &bytes_consumed);
563
+ return bytes_consumed;
564
+ }
565
+
566
+ ScriptScanner::ScriptScanner(const char* buffer,
567
+ int buffer_length,
568
+ bool is_plain_text)
569
+ : start_byte_(buffer),
570
+ next_byte_(buffer),
571
+ byte_length_(buffer_length),
572
+ is_plain_text_(is_plain_text),
573
+ letters_marks_only_(true),
574
+ one_script_only_(true),
575
+ exit_state_(kMaxExitStateLettersMarksOnly) {
576
+ script_buffer_ = new char[kMaxScriptBuffer];
577
+ script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
578
+ map2original_.Clear(); // map from script_buffer_ to buffer
579
+ map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_
580
+ }
581
+
582
+ // Extended version to allow spans of any non-tag text and spans of mixed script
583
+ ScriptScanner::ScriptScanner(const char* buffer,
584
+ int buffer_length,
585
+ bool is_plain_text,
586
+ bool any_text,
587
+ bool any_script)
588
+ : start_byte_(buffer),
589
+ next_byte_(buffer),
590
+ byte_length_(buffer_length),
591
+ is_plain_text_(is_plain_text),
592
+ letters_marks_only_(!any_text),
593
+ one_script_only_(!any_script),
594
+ exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) {
595
+ script_buffer_ = new char[kMaxScriptBuffer];
596
+ script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
597
+ map2original_.Clear(); // map from script_buffer_ to buffer
598
+ map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_
599
+ }
600
+
601
+
602
+ ScriptScanner::~ScriptScanner() {
603
+ delete[] script_buffer_;
604
+ delete[] script_buffer_lower_;
605
+ }
606
+
607
+
608
+
609
+
610
+ // Get to the first real non-tag letter or entity that is a letter
611
+ // Sets script of that letter
612
+ // Return len if no more letters
613
+ int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
614
+ int sc = UNKNOWN_ULSCRIPT;
615
+ int skip = 0;
616
+ int tlen, plen;
617
+
618
+ // Do run of non-letters (tag | &NL | NL)*
619
+ tlen = 0;
620
+ while (skip < len) {
621
+ // Do fast scan to next interesting byte
622
+ // int oldskip = skip;
623
+ skip += ScanToLetterOrSpecial(src + skip, len - skip);
624
+
625
+ // Check for no more letters/specials
626
+ if (skip >= len) {
627
+ // All done
628
+ *script = sc;
629
+ return len;
630
+ }
631
+
632
+ // We are at a letter, nonletter, tag, or entity
633
+ if (IsSpecial(src[skip]) && !is_plain_text_) {
634
+ if (src[skip] == '<') {
635
+ // Begining of tag; skip to end and go around again
636
+ tlen = ScanToPossibleLetter(src + skip, len - skip,
637
+ exit_state_);
638
+ sc = 0;
639
+ } else if (src[skip] == '>') {
640
+ // Unexpected end of tag; skip it and go around again
641
+ tlen = 1; // Over the >
642
+ sc = 0;
643
+ } else if (src[skip] == '&') {
644
+ // Expand entity, no advance
645
+ char temp[4];
646
+ EntityToBuffer(src + skip, len - skip,
647
+ temp, &tlen, &plen);
648
+ if (plen > 0) {
649
+ sc = GetUTF8LetterScriptNum(temp);
650
+ }
651
+ }
652
+ } else {
653
+ // Update 1..4 bytes
654
+ tlen = UTF8OneCharLen(src + skip);
655
+ sc = GetUTF8LetterScriptNum(src + skip);
656
+ }
657
+ if (sc != 0) {break;} // Letter found
658
+ skip += tlen; // Else advance
659
+ }
660
+
661
+ *script = sc;
662
+ return skip;
663
+ }
664
+
665
+
666
// For ASCII-only tag names.
// Case-insensitive match of one byte: sets bit 0x20 in uplow (which maps
// ASCII 'A'..'Z' onto 'a'..'z') and compares the result with c.
// Callers in this file pass a lowercase letter as c.
inline bool EqCase(char uplow, char c) {
  const int lowered = uplow | 0x20;
  return c == lowered;
}
671
+
672
// For ASCII-only tag names.
// Returns true when c compares below 0x40 ('@'), i.e. for space, digits,
// '/', '<', '>' and the other ASCII punctuation/control bytes that sort
// before the letters.
// NOTE(review): the comparison is done on plain char, so the result for
// bytes >= 0x80 depends on whether char is signed on the target platform.
inline bool NeqLetter(char c) {
  const char kAtSign = 0x40;  // '@', the byte just before 'A'
  return c < kAtSign;
}
677
+
678
// For ASCII-only tag names.
// Returns true for space and '\n' only; every other byte, including '\r',
// yields false.
inline bool WS(char c) {
  switch (c) {
    case ' ':
    case '\n':
      return true;
    default:
      return false;
  }
}
683
+
684
// Canonical line break: the text-span scanner maps both CR and LF in the
// input onto this single character.
static constexpr char LF = '\n';
686
+
687
+
688
+ // The naive loop scans from next_byte_ to script_buffer_ until full.
689
+ // But this can leave an awkward hard-to-identify short fragment at the
690
+ // end of the input. We would prefer to make the next-to-last fragment
691
+ // shorter and the last fragment longer.
692
+
693
+ // Copy next run of non-tag characters to buffer [NUL terminated]
694
+ // This just replaces tags with space or \n and removes entities.
695
+ // Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences
696
+ // including \r or \n are replaced by \n. All other tags and skipped text
697
+ // are replaced with ASCII space.
698
+ //
699
+ // Buffer ALWAYS has leading space and trailing space space space NUL
700
+ bool ScriptScanner::GetOneTextSpan(LangSpan* span) {
701
+ span->text = script_buffer_;
702
+ span->text_bytes = 0;
703
+ span->offset = next_byte_ - start_byte_;
704
+ span->ulscript = UNKNOWN_ULSCRIPT;
705
+ span->truncated = false;
706
+
707
+ int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
708
+ if ((kMaxScriptBytes <= byte_length_) &&
709
+ (byte_length_ < (2 * kMaxScriptBytes))) {
710
+ // Try to split the last two fragments in half
711
+ put_soft_limit = byte_length_ / 2;
712
+ }
713
+
714
+ script_buffer_[0] = ' '; // Always a space at front of output
715
+ script_buffer_[1] = '\0';
716
+ int take = 0;
717
+ int put = 1; // Start after the initial space
718
+ int tlen = 0, plen = 0;
719
+
720
+ if (byte_length_ <= 0) {
721
+ return false; // No more text to be found
722
+ }
723
+
724
+ // Go over alternating spans of text and tags,
725
+ // copying letters to buffer with single spaces for each run of non-letters
726
+ bool last_byte_was_space = false;
727
+ while (take < byte_length_) {
728
+ char c = next_byte_[take];
729
+ if (c == '\r') {c = LF;} // Canonical CR or LF
730
+ if (c == '\n') {c = LF;} // Canonical CR or LF
731
+
732
+ if (IsSpecial(c) && !is_plain_text_) {
733
+ if (c == '<') {
734
+ // Replace tag with space
735
+ c = ' '; // for almost-full test below
736
+ // or if <p> <br> <tr>, replace with \n
737
+ if (take < (byte_length_ - 3)) {
738
+ if (EqCase(next_byte_[take + 1], 'p') &&
739
+ NeqLetter(next_byte_[take + 2])) {
740
+ c = LF;
741
+ }
742
+ if (EqCase(next_byte_[take + 1], 'b') &&
743
+ EqCase(next_byte_[take + 2], 'r') &&
744
+ NeqLetter(next_byte_[take + 3])) {
745
+ c = LF;
746
+ }
747
+ if (EqCase(next_byte_[take + 1], 't') &&
748
+ EqCase(next_byte_[take + 2], 'r') &&
749
+ NeqLetter(next_byte_[take + 3])) {
750
+ c = LF;
751
+ }
752
+ }
753
+ // Begining of tag; skip to end and go around again
754
+ tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
755
+ exit_state_);
756
+ // Copy one byte, compressing spaces
757
+ if (!last_byte_was_space || !WS(c)) {
758
+ script_buffer_[put++] = c; // Advance dest
759
+ last_byte_was_space = WS(c);
760
+ }
761
+ } else if (c == '>') {
762
+ // Unexpected end of tag; copy it and go around again
763
+ tlen = 1; // Over the >
764
+ script_buffer_[put++] = c; // Advance dest
765
+ } else if (c == '&') {
766
+ // Expand entity, no advance
767
+ EntityToBuffer(next_byte_ + take, byte_length_ - take,
768
+ script_buffer_ + put, &tlen, &plen);
769
+ put += plen; // Advance dest
770
+ }
771
+ take += tlen; // Advance source
772
+ } else {
773
+ // Copy one byte, compressing spaces
774
+ if (!last_byte_was_space || !WS(c)) {
775
+ script_buffer_[put++] = c; // Advance dest
776
+ last_byte_was_space = WS(c);
777
+ }
778
+ ++take; // Advance source
779
+ }
780
+
781
+ if (WS(c) &&
782
+ (put >= put_soft_limit)) {
783
+ // Buffer is almost full
784
+ span->truncated = true;
785
+ break;
786
+ }
787
+ if (put >= kMaxScriptBytes) {
788
+ // Buffer is completely full
789
+ span->truncated = true;
790
+ break;
791
+ }
792
+ }
793
+
794
+ // Almost done. Back up to a character boundary if needed
795
+ while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) {
796
+ // Back up over continuation byte
797
+ --take;
798
+ --put;
799
+ }
800
+
801
+ // Update input position
802
+ next_byte_ += take;
803
+ byte_length_ -= take;
804
+
805
+ // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
806
+ // kMaxScriptBytes | | put
807
+ script_buffer_[put + 0] = ' ';
808
+ script_buffer_[put + 1] = ' ';
809
+ script_buffer_[put + 2] = ' ';
810
+ script_buffer_[put + 3] = '\0';
811
+
812
+ span->text_bytes = put; // Does not include the last four chars above
813
+ return true;
814
+ }
815
+
816
+
817
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
818
+ // Buffer ALWAYS has leading space and trailing space space space NUL
819
+ bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
820
+ if (!letters_marks_only_) {
821
+ // Return non-tag text, including punctuation and digits
822
+ return GetOneTextSpan(span);
823
+ }
824
+
825
+ span->text = script_buffer_;
826
+ span->text_bytes = 0;
827
+ span->offset = next_byte_ - start_byte_;
828
+ span->ulscript = UNKNOWN_ULSCRIPT;
829
+ span->truncated = false;
830
+
831
+ // struct timeval script_start, script_mid, script_end;
832
+
833
+ int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
834
+ if ((kMaxScriptBytes <= byte_length_) &&
835
+ (byte_length_ < (2 * kMaxScriptBytes))) {
836
+ // Try to split the last two fragments in half
837
+ put_soft_limit = byte_length_ / 2;
838
+ }
839
+
840
+
841
+ int spanscript; // The script of this span
842
+ int sc = UNKNOWN_ULSCRIPT; // The script of next character
843
+ int tlen = 0;
844
+ int plen = 0;
845
+
846
+ script_buffer_[0] = ' '; // Always a space at front of output
847
+ script_buffer_[1] = '\0';
848
+ int take = 0;
849
+ int put = 1; // Start after the initial space
850
+
851
+ // Build offsets from span->text back to start_byte_ + span->offset
852
+ // This mapping reflects deletion of non-letters, expansion of
853
+ // entities, etc.
854
+ map2original_.Clear();
855
+ map2original_.Delete(span->offset); // So that MapBack(0) gives offset
856
+
857
+ // Get to the first real non-tag letter or entity that is a letter
858
+ int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
859
+ next_byte_ += skip;
860
+ byte_length_ -= skip;
861
+
862
+ if (skip != 1) {
863
+ map2original_.Delete(skip);
864
+ map2original_.Insert(1);
865
+ } else {
866
+ map2original_.Copy(1);
867
+ }
868
+ if (byte_length_ <= 0) {
869
+ map2original_.Reset();
870
+ return false; // No more letters to be found
871
+ }
872
+
873
+ // There is at least one letter, so we know the script for this span
874
+ span->ulscript = (ULScript)spanscript;
875
+
876
+
877
+ // Go over alternating spans of same-script letters and non-letters,
878
+ // copying letters to buffer with single spaces for each run of non-letters
879
+ while (take < byte_length_) {
880
+ // Copy run of letters in same script (&LS | LS)*
881
+ int letter_count = 0; // Keep track of word length
882
+ bool need_break = false;
883
+
884
+ while (take < byte_length_) {
885
+ // We are at a letter, nonletter, tag, or entity
886
+ if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
887
+ if (next_byte_[take] == '<') {
888
+ // Begining of tag
889
+ sc = 0;
890
+ break;
891
+ } else if (next_byte_[take] == '>') {
892
+ // Unexpected end of tag
893
+ sc = 0;
894
+ break;
895
+ } else if (next_byte_[take] == '&') {
896
+ // Copy entity, no advance
897
+ EntityToBuffer(next_byte_ + take, byte_length_ - take,
898
+ script_buffer_ + put, &tlen, &plen);
899
+ if (plen > 0) {
900
+ sc = GetUTF8LetterScriptNum(script_buffer_ + put);
901
+ }
902
+ }
903
+ } else {
904
+ // Real letter, safely copy up to 4 bytes, increment by 1..4
905
+ // Will update by 1..4 bytes at Advance, below
906
+ tlen = plen = UTF8OneCharLen(next_byte_ + take);
907
+ if (take < (byte_length_ - 3)) {
908
+ // X86 fast case, does unaligned load/store
909
+ UNALIGNED_STORE32(script_buffer_ + put,
910
+ UNALIGNED_LOAD32(next_byte_ + take));
911
+
912
+ } else {
913
+ // Slow case, happens 1-3 times per input document
914
+ memcpy(script_buffer_ + put, next_byte_ + take, plen);
915
+ }
916
+ sc = GetUTF8LetterScriptNum(next_byte_ + take);
917
+ }
918
+
919
+ // Allow continue across a single letter in a different script:
920
+ // A B D = three scripts, c = common script, i = inherited script,
921
+ // - = don't care, ( = take position before the += below
922
+ // AAA(A- continue
923
+ //
924
+ // AAA(BA continue
925
+ // AAA(BB break
926
+ // AAA(Bc continue (breaks after B)
927
+ // AAA(BD break
928
+ // AAA(Bi break
929
+ //
930
+ // AAA(c- break
931
+ //
932
+ // AAA(i- continue
933
+ //
934
+
935
+ if ((sc != spanscript) && (sc != ULScript_Inherited)) {
936
+ // Might need to break this script span
937
+ if (sc == ULScript_Common) {
938
+ need_break = true;
939
+ } else {
940
+ // Look at next following character, ignoring entity as Common
941
+ int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen);
942
+ if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
943
+ // We found a non-trivial change of script
944
+ if (one_script_only_) {
945
+ need_break = true;
946
+ }
947
+ }
948
+ }
949
+ }
950
+ if (need_break) {break;} // Non-letter or letter in wrong script
951
+
952
+ take += tlen; // Advance
953
+ put += plen; // Advance
954
+
955
+ // Update the offset map to reflect take/put lengths
956
+ if (tlen == plen) {
957
+ map2original_.Copy(tlen);
958
+ } else if (tlen < plen) {
959
+ map2original_.Copy(tlen);
960
+ map2original_.Insert(plen - tlen);
961
+ } else { // plen < tlen
962
+ map2original_.Copy(plen);
963
+ map2original_.Delete(tlen - plen);
964
+ }
965
+
966
+ ++letter_count;
967
+ if (put >= kMaxScriptBytes) {
968
+ // Buffer is full
969
+ span->truncated = true;
970
+ break;
971
+ }
972
+ } // End while letters
973
+
974
+ // Do run of non-letters (tag | &NL | NL)*
975
+ while (take < byte_length_) {
976
+ // Do fast scan to next interesting byte
977
+ tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
978
+ take += tlen;
979
+ map2original_.Delete(tlen);
980
+ if (take >= byte_length_) {break;} // Might have scanned to end
981
+
982
+ // We are at a letter, nonletter, tag, or entity
983
+ if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
984
+ if (next_byte_[take] == '<') {
985
+ // Begining of tag; skip to end and go around again
986
+ tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
987
+ exit_state_);
988
+ sc = 0;
989
+ } else if (next_byte_[take] == '>') {
990
+ // Unexpected end of tag; skip it and go around again
991
+ tlen = 1; // Over the >
992
+ sc = 0;
993
+ } else if (next_byte_[take] == '&') {
994
+ // Expand entity, no advance
995
+ EntityToBuffer(next_byte_ + take, byte_length_ - take,
996
+ script_buffer_ + put, &tlen, &plen);
997
+ if (plen > 0) {
998
+ sc = GetUTF8LetterScriptNum(script_buffer_ + put);
999
+ }
1000
+ }
1001
+ } else {
1002
+ // Update 1..4
1003
+ tlen = UTF8OneCharLen(next_byte_ + take);
1004
+ sc = GetUTF8LetterScriptNum(next_byte_ + take);
1005
+ }
1006
+ if (sc != 0) {break;} // Letter found
1007
+ take += tlen; // Else advance
1008
+ map2original_.Delete(tlen);
1009
+ } // End while not-letters
1010
+
1011
+ script_buffer_[put++] = ' ';
1012
+ map2original_.Insert(1);
1013
+
1014
+ // Letter in wrong script ?
1015
+ if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;}
1016
+ if (put >= put_soft_limit) {
1017
+ // Buffer is almost full
1018
+ span->truncated = true;
1019
+ break;
1020
+ }
1021
+ }
1022
+
1023
+ // Almost done. Back up to a character boundary if needed
1024
+ while ((0 < take) && (take < byte_length_) &&
1025
+ ((next_byte_[take] & 0xc0) == 0x80)) {
1026
+ // Back up over continuation byte
1027
+ --take;
1028
+ --put;
1029
+ }
1030
+
1031
+ // Update input position
1032
+ next_byte_ += take;
1033
+ byte_length_ -= take;
1034
+
1035
+ // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
1036
+ // kMaxScriptBytes | | put
1037
+ script_buffer_[put + 0] = ' ';
1038
+ script_buffer_[put + 1] = ' ';
1039
+ script_buffer_[put + 2] = ' ';
1040
+ script_buffer_[put + 3] = '\0';
1041
+ map2original_.Insert(4);
1042
+ map2original_.Reset();
1043
+
1044
+ span->text_bytes = put; // Does not include the last four chars above
1045
+ return true;
1046
+ }
1047
+
1048
+ // Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase
1049
+ // List changes with each version of Unicode, so just always lowercase
1050
+ // Unicode 6.2.0:
1051
+ // ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN
1052
+ void ScriptScanner::LowerScriptSpan(LangSpan* span) {
1053
+ // If needed, lowercase all the text. If we do it sooner, might miss
1054
+ // lowercasing an entity such as &Aacute;
1055
+ // We only need to do this for Latn and Cyrl scripts
1056
+ map2uplow_.Clear();
1057
+ // Full Unicode lowercase of the entire buffer, including
1058
+ // four pad bytes off the end.
1059
+ // Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad
1060
+ // bytes and put the 0x00 in explicitly.
1061
+ // Build an offset map from script_buffer_lower_ back to script_buffer_
1062
+ int consumed, filled, changed;
1063
+ StringPiece istr(span->text, span->text_bytes + 3);
1064
+ StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer);
1065
+
1066
+ UTF8GenericReplace(&utf8repl_lettermarklower_obj,
1067
+ istr, ostr, is_plain_text_,
1068
+ &consumed, &filled, &changed, &map2uplow_);
1069
+ script_buffer_lower_[filled] = '\0';
1070
+ span->text = script_buffer_lower_;
1071
+ span->text_bytes = filled - 3;
1072
+ map2uplow_.Reset();
1073
+ }
1074
+
1075
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
1076
+ // Force Latin, Cyrillic, Greek scripts to be lowercase
1077
+ // Buffer ALWAYS has leading space and trailing space space space NUL
1078
+ bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) {
1079
+ bool ok = GetOneScriptSpan(span);
1080
+ if (ok) {
1081
+ LowerScriptSpan(span);
1082
+ }
1083
+ return ok;
1084
+ }
1085
+
1086
+ // Maps byte offset in most recent GetOneScriptSpan/Lower
1087
+ // span->text [0..text_bytes] into an additional byte offset from
1088
+ // span->offset, to get back to corresponding text in the original
1089
+ // input buffer.
1090
+ // text_offset must be the first byte
1091
+ // of a UTF-8 character, or just beyond the last character. Normally this
1092
+ // routine is called with the first byte of an interesting range and
1093
+ // again with the first byte of the following range.
1094
+ int ScriptScanner::MapBack(int text_offset) {
1095
+ return map2original_.MapBack(map2uplow_.MapBack(text_offset));
1096
+ }
1097
+
1098
+
1099
+ // Gets lscript number for letters; always returns
1100
+ // 0 (common script) for non-letters
1101
+ int GetUTF8LetterScriptNum(const char* src) {
1102
+ int srclen = UTF8OneCharLen(src);
1103
+ const uint8* usrc = reinterpret_cast<const uint8*>(src);
1104
+ return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj,
1105
+ &usrc, &srclen);
1106
+ }
1107
+
1108
+ } // namespace CLD2
1109
+ } // namespace chrome_lang_id