cld3 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +18 -0
- data/LICENSE +204 -0
- data/LICENSE_CLD3 +203 -0
- data/README.md +22 -0
- data/cld3.gemspec +35 -0
- data/ext/cld3/base.cc +36 -0
- data/ext/cld3/base.h +106 -0
- data/ext/cld3/casts.h +98 -0
- data/ext/cld3/embedding_feature_extractor.cc +51 -0
- data/ext/cld3/embedding_feature_extractor.h +182 -0
- data/ext/cld3/embedding_network.cc +196 -0
- data/ext/cld3/embedding_network.h +186 -0
- data/ext/cld3/embedding_network_params.h +285 -0
- data/ext/cld3/extconf.rb +49 -0
- data/ext/cld3/feature_extractor.cc +137 -0
- data/ext/cld3/feature_extractor.h +633 -0
- data/ext/cld3/feature_extractor.proto +50 -0
- data/ext/cld3/feature_types.cc +72 -0
- data/ext/cld3/feature_types.h +158 -0
- data/ext/cld3/fixunicodevalue.cc +55 -0
- data/ext/cld3/fixunicodevalue.h +69 -0
- data/ext/cld3/float16.h +58 -0
- data/ext/cld3/fml_parser.cc +308 -0
- data/ext/cld3/fml_parser.h +123 -0
- data/ext/cld3/generated_entities.cc +296 -0
- data/ext/cld3/generated_ulscript.cc +678 -0
- data/ext/cld3/generated_ulscript.h +142 -0
- data/ext/cld3/getonescriptspan.cc +1109 -0
- data/ext/cld3/getonescriptspan.h +124 -0
- data/ext/cld3/integral_types.h +37 -0
- data/ext/cld3/lang_id_nn_params.cc +57449 -0
- data/ext/cld3/lang_id_nn_params.h +178 -0
- data/ext/cld3/language_identifier_features.cc +165 -0
- data/ext/cld3/language_identifier_features.h +116 -0
- data/ext/cld3/nnet_language_identifier.cc +380 -0
- data/ext/cld3/nnet_language_identifier.h +175 -0
- data/ext/cld3/nnet_language_identifier_c.cc +72 -0
- data/ext/cld3/offsetmap.cc +478 -0
- data/ext/cld3/offsetmap.h +168 -0
- data/ext/cld3/port.h +143 -0
- data/ext/cld3/registry.cc +28 -0
- data/ext/cld3/registry.h +242 -0
- data/ext/cld3/relevant_script_feature.cc +89 -0
- data/ext/cld3/relevant_script_feature.h +49 -0
- data/ext/cld3/script_detector.h +156 -0
- data/ext/cld3/sentence.proto +77 -0
- data/ext/cld3/sentence_features.cc +29 -0
- data/ext/cld3/sentence_features.h +35 -0
- data/ext/cld3/simple_adder.h +72 -0
- data/ext/cld3/stringpiece.h +81 -0
- data/ext/cld3/task_context.cc +161 -0
- data/ext/cld3/task_context.h +81 -0
- data/ext/cld3/task_context_params.cc +74 -0
- data/ext/cld3/task_context_params.h +54 -0
- data/ext/cld3/task_spec.proto +98 -0
- data/ext/cld3/text_processing.cc +245 -0
- data/ext/cld3/text_processing.h +30 -0
- data/ext/cld3/unicodetext.cc +96 -0
- data/ext/cld3/unicodetext.h +144 -0
- data/ext/cld3/utf8acceptinterchange.h +486 -0
- data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
- data/ext/cld3/utf8repl_lettermarklower.h +758 -0
- data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
- data/ext/cld3/utf8statetable.cc +1344 -0
- data/ext/cld3/utf8statetable.h +285 -0
- data/ext/cld3/utils.cc +241 -0
- data/ext/cld3/utils.h +144 -0
- data/ext/cld3/workspace.cc +64 -0
- data/ext/cld3/workspace.h +177 -0
- data/lib/cld3.rb +99 -0
- metadata +158 -0
@@ -0,0 +1,142 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
// generated_ulscript.h
|
16
|
+
// Machine generated. Do Not Edit.
|
17
|
+
//
|
18
|
+
// Declarations for scripts recognized by CLD2
|
19
|
+
//
|
20
|
+
|
21
|
+
#ifndef SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
22
|
+
#define SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
23
|
+
|
24
|
+
namespace chrome_lang_id {
|
25
|
+
namespace CLD2 {
|
26
|
+
|
27
|
+
// Recognition-type tag for a script. The enumerator names suggest
// "none / one / many / CJK" -- NOTE(review): the exact meaning is defined by
// the generated tables that use this type, which are not in this header;
// confirm there before relying on it.
typedef enum {RTypeNone = 0, RTypeOne, RTypeMany, RTypeCJK} ULScriptRType;
|
28
|
+
|
29
|
+
// One entry of a string-to-integer lookup table: s is the C-string key and
// i is the integer associated with it.
typedef struct {const char* s; int i;} CharIntPair;
|
30
|
+
|
31
|
+
// Script identifiers. Each enumerator has an explicit value; the trailing
// comment on each line is the four-letter code the generator emitted for it
// (these look like ISO 15924 codes -- TODO confirm against the generator).
// ULScript_32, ULScript_33 and ULScript_35 carry no code and only hold their
// numeric slots. NUM_ULSCRIPTS follows the last script, so it equals 102, the
// number of enumerators before it.
typedef enum {
  ULScript_Common = 0, // Zyyy
  ULScript_Latin = 1, // Latn
  ULScript_Greek = 2, // Grek
  ULScript_Cyrillic = 3, // Cyrl
  ULScript_Armenian = 4, // Armn
  ULScript_Hebrew = 5, // Hebr
  ULScript_Arabic = 6, // Arab
  ULScript_Syriac = 7, // Syrc
  ULScript_Thaana = 8, // Thaa
  ULScript_Devanagari = 9, // Deva
  ULScript_Bengali = 10, // Beng
  ULScript_Gurmukhi = 11, // Guru
  ULScript_Gujarati = 12, // Gujr
  ULScript_Oriya = 13, // Orya
  ULScript_Tamil = 14, // Taml
  ULScript_Telugu = 15, // Telu
  ULScript_Kannada = 16, // Knda
  ULScript_Malayalam = 17, // Mlym
  ULScript_Sinhala = 18, // Sinh
  ULScript_Thai = 19, // Thai
  ULScript_Lao = 20, // Laoo
  ULScript_Tibetan = 21, // Tibt
  ULScript_Myanmar = 22, // Mymr
  ULScript_Georgian = 23, // Geor
  ULScript_Hani = 24, // Hani
  ULScript_Ethiopic = 25, // Ethi
  ULScript_Cherokee = 26, // Cher
  ULScript_Canadian_Aboriginal = 27, // Cans
  ULScript_Ogham = 28, // Ogam
  ULScript_Runic = 29, // Runr
  ULScript_Khmer = 30, // Khmr
  ULScript_Mongolian = 31, // Mong
  ULScript_32 = 32, //
  ULScript_33 = 33, //
  ULScript_Bopomofo = 34, // Bopo
  ULScript_35 = 35, //
  ULScript_Yi = 36, // Yiii
  ULScript_Old_Italic = 37, // Ital
  ULScript_Gothic = 38, // Goth
  ULScript_Deseret = 39, // Dsrt
  ULScript_Inherited = 40, // Zinh
  ULScript_Tagalog = 41, // Tglg
  ULScript_Hanunoo = 42, // Hano
  ULScript_Buhid = 43, // Buhd
  ULScript_Tagbanwa = 44, // Tagb
  ULScript_Limbu = 45, // Limb
  ULScript_Tai_Le = 46, // Tale
  ULScript_Linear_B = 47, // Linb
  ULScript_Ugaritic = 48, // Ugar
  ULScript_Shavian = 49, // Shaw
  ULScript_Osmanya = 50, // Osma
  ULScript_Cypriot = 51, // Cprt
  ULScript_Braille = 52, // Brai
  ULScript_Buginese = 53, // Bugi
  ULScript_Coptic = 54, // Copt
  ULScript_New_Tai_Lue = 55, // Talu
  ULScript_Glagolitic = 56, // Glag
  ULScript_Tifinagh = 57, // Tfng
  ULScript_Syloti_Nagri = 58, // Sylo
  ULScript_Old_Persian = 59, // Xpeo
  ULScript_Kharoshthi = 60, // Khar
  ULScript_Balinese = 61, // Bali
  ULScript_Cuneiform = 62, // Xsux
  ULScript_Phoenician = 63, // Phnx
  ULScript_Phags_Pa = 64, // Phag
  ULScript_Nko = 65, // Nkoo
  ULScript_Sundanese = 66, // Sund
  ULScript_Lepcha = 67, // Lepc
  ULScript_Ol_Chiki = 68, // Olck
  ULScript_Vai = 69, // Vaii
  ULScript_Saurashtra = 70, // Saur
  ULScript_Kayah_Li = 71, // Kali
  ULScript_Rejang = 72, // Rjng
  ULScript_Lycian = 73, // Lyci
  ULScript_Carian = 74, // Cari
  ULScript_Lydian = 75, // Lydi
  ULScript_Cham = 76, // Cham
  ULScript_Tai_Tham = 77, // Lana
  ULScript_Tai_Viet = 78, // Tavt
  ULScript_Avestan = 79, // Avst
  ULScript_Egyptian_Hieroglyphs = 80, // Egyp
  ULScript_Samaritan = 81, // Samr
  ULScript_Lisu = 82, // Lisu
  ULScript_Bamum = 83, // Bamu
  ULScript_Javanese = 84, // Java
  ULScript_Meetei_Mayek = 85, // Mtei
  ULScript_Imperial_Aramaic = 86, // Armi
  ULScript_Old_South_Arabian = 87, // Sarb
  ULScript_Inscriptional_Parthian = 88, // Prti
  ULScript_Inscriptional_Pahlavi = 89, // Phli
  ULScript_Old_Turkic = 90, // Orkh
  ULScript_Kaithi = 91, // Kthi
  ULScript_Batak = 92, // Batk
  ULScript_Brahmi = 93, // Brah
  ULScript_Mandaic = 94, // Mand
  ULScript_Chakma = 95, // Cakm
  ULScript_Meroitic_Cursive = 96, // Merc
  ULScript_Meroitic_Hieroglyphs = 97, // Mero
  ULScript_Miao = 98, // Plrd
  ULScript_Sharada = 99, // Shrd
  ULScript_Sora_Sompeng = 100, // Sora
  ULScript_Takri = 101, // Takr
  NUM_ULSCRIPTS
} ULScript;

// The "unknown" script is an alias for ULScript_Common (value 0).
#define UNKNOWN_ULSCRIPT ULScript_Common
|
138
|
+
|
139
|
+
} // namespace CLD2
|
140
|
+
} // namespace chrome_lang_id
|
141
|
+
|
142
|
+
#endif // SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
@@ -0,0 +1,1109 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// Author: dsites@google.com (Dick Sites)
|
17
|
+
//
|
18
|
+
|
19
|
+
|
20
|
+
#include "getonescriptspan.h"
|
21
|
+
|
22
|
+
#include <string.h>
|
23
|
+
|
24
|
+
#include "fixunicodevalue.h"
|
25
|
+
#include "port.h"
|
26
|
+
#include "utf8acceptinterchange.h"
|
27
|
+
#include "utf8repl_lettermarklower.h"
|
28
|
+
#include "utf8prop_lettermarkscriptnum.h"
|
29
|
+
#include "utf8scannot_lettermarkspecial.h"
|
30
|
+
#include "utf8statetable.h"
|
31
|
+
|
32
|
+
namespace chrome_lang_id {
|
33
|
+
namespace CLD2 {
|
34
|
+
|
35
|
+
// Alphabetical order for binary search, from
|
36
|
+
// generated_entities.cc
|
37
|
+
extern const int kNameToEntitySize;
|
38
|
+
extern const CharIntPair kNameToEntity[];
|
39
|
+
|
40
|
+
// Byte-indexed flag table: nonzero only for the three HTML-significant ASCII
// characters '&' (0x26), '<' (0x3C) and '>' (0x3E).
static const char kSpecialSymbol[256] = {
  // 0x00 - 0x0F
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  // 0x10 - 0x1F
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  // 0x20 - 0x2F: '&' at 0x26
  0,0,0,0, 0,0,1,0, 0,0,0,0, 0,0,0,0,
  // 0x30 - 0x3F: '<' at 0x3C, '>' at 0x3E
  0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,1,0,
  // 0x40 - 0xFF: no initializers are listed, so these entries are zero.
};
|
51
|
+
|
52
|
+
|
53
|
+
|
54
|
+
// Category numbers used as column indexes of the tag-parsing state table.
// The short names exist only so the 256-entry table below stays readable;
// every one of them is #undef'd again right after the table.
#define LT 0      // <
#define GT 1      // >
#define EX 2      // !
#define HY 3      // -
#define QU 4      // "
#define AP 5      // '
#define SL 6      // /
#define S_ 7      // S or s
#define C_ 8      // C or c
#define R_ 9      // R or r
#define I_ 10     // I or i
#define P_ 11     // P or p
#define T_ 12     // T or t
#define Y_ 13     // Y or y
#define L_ 14     // L or l
#define E_ 15     // E or e
#define CR 16     // <cr> or <lf>
#define NL 17     // non-letter: ASCII whitespace, digit, punctuation
#define PL 18     // possible letter, incl. &
#define xx 19     // <unused>

// Map byte to one of ~20 interesting categories for cheap tag parsing.
// One row per 16 byte values; upper- and lower-case ASCII letters map to the
// same category, bytes 0x80-0xBF are NL and bytes 0xC0-0xFF are PL.
static const uint8 kCharToSub[256] = {
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,   // 0x00: LF=0x0A, CR=0x0D
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0x10
  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,   // 0x20: space .. slash
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,   // 0x30: digits, < at 0x3C, > at 0x3E

  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,   // 0x40: @ A..O
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,   // 0x50: P..Z, then punctuation
  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,   // 0x60: ` a..o
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,   // 0x70: p..z, then punctuation

  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0x80
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0x90
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0xA0
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,   // 0xB0

  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,   // 0xC0
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,   // 0xD0
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,   // 0xE0
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,   // 0xF0
};

// Retire the table-only shorthand names.
#undef LT
#undef GT
#undef EX
#undef HY
#undef QU
#undef AP
#undef SL
#undef S_
#undef C_
#undef R_
#undef I_
#undef P_
#undef T_
#undef Y_
#undef L_
#undef E_
#undef CR
#undef NL
#undef PL
#undef xx
|
118
|
+
|
119
|
+
|
120
|
+
// Shorthand for the two exit states inside the table below; both names are
// #undef'd right after it.
#define OK 0
#define X_ 1


// Largest state number that ends a scan: states 0..1 when only letters and
// marks are wanted, states 0..2 when any non-tag text is wanted.
static const int kMaxExitStateLettersMarksOnly = 1;
static const int kMaxExitStateAllText = 2;


// State machine to do cheap parse of non-letter strings incl. tags
//  advances <tag>
//           |    |
//  advances <tag> ... </tag>  for <script> <style>
//           |               |
//  advances <!-- ... <tag> ... -->
//           |                     |
//  advances <tag
//           ||  (0)
//  advances <tag <tag2>
//           ||  (0)
//
// We start in state [0] at a non-letter and make at least one transition
// When scanning for just letters, arriving back at state [0] or [1] exits
// the state machine.
// When scanning for any non-tag text, arriving at state [2] also exits
//
// Layout: one row of 20 entries per state; the column is the kCharToSub
// category of the current byte and the entry is the next state.
static const uint8 kTagParseTbl_0[] = {
// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK exit state
  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL* [exit state]
  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF
  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF
  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
};

#undef OK
#undef X_
|
194
|
+
|
195
|
+
// Constants describing the UTF-8 / rune encoding:
//   UTFmax    - maximum bytes per rune
//   Runesync  - bytes below this cannot be part of a multi-byte UTF sequence
//   Runeself  - below this, the rune and its UTF sequence are the same
//   Runeerror - rune produced for a decoding error
//   Runemax   - maximum rune value
enum {
  UTFmax = 4,
  Runesync = 0x80,
  Runeself = 0x80,
  Runeerror = 0xFFFD,
  Runemax = 0x10FFFF
};
|
203
|
+
|
204
|
+
// Debugging aid. Not thread safe: every call overwrites and returns the same
// static buffer.
static char gDisplayPiece[32];

// UTF-8 sequence length indexed by the high nibble of a byte. Entries 8..11
// (continuation bytes) are 1; DisplayPiece never looks those up.
const unsigned char gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4};

// Copies up to 8 UTF-8 characters from next_byte_ into gDisplayPiece,
// replacing < > & ' " with their HTML entities, and NUL-terminates the result.
//
//   next_byte_    start of the text to display
//   byte_length_  number of readable bytes at next_byte_
//
// Returns gDisplayPiece. Copying stops early at a character that would run
// past byte_length_, when the buffer may not have room for another character,
// or after 8 characters.
char* DisplayPiece(const char* next_byte_, int byte_length_) {
  const int kBufferSize = static_cast<int>(sizeof(gDisplayPiece));
  int k = 0;    // byte count
  int n = 0;    // character count
  for (int i = 0; i < byte_length_; ++i) {
    char c = next_byte_[i];
    if ((c & 0xc0) != 0x80) {
      // Beginning of a UTF-8 character
      int charlen = gCharlen[static_cast<unsigned char>(c) >> 4];
      if (i + charlen > byte_length_) {break;}  // Not enough room for full char
      // Worst case from here is a 6-byte entity plus the terminating NUL.
      if (k >= (kBufferSize - 7)) {break;}      // Not necessarily enough room
      if (n >= 8) {break;}                      // Enough characters already
      ++n;
    } else if (k >= (kBufferSize - 1)) {
      // Continuation bytes skip the checks above, so a long run of them
      // (malformed input) must be bounded here to keep room for the NUL.
      break;
    }
    if (c == '<') {
      memcpy(&gDisplayPiece[k], "&lt;", 4); k += 4;
    } else if (c == '>') {
      memcpy(&gDisplayPiece[k], "&gt;", 4); k += 4;
    } else if (c == '&') {
      memcpy(&gDisplayPiece[k], "&amp;", 5); k += 5;
    } else if (c == '\'') {
      memcpy(&gDisplayPiece[k], "&apos;", 6); k += 6;
    } else if (c == '"') {
      memcpy(&gDisplayPiece[k], "&quot;", 6); k += 6;
    } else {
      gDisplayPiece[k++] = c;
    }
  }
  gDisplayPiece[k] = '\0';
  return gDisplayPiece;
}
|
238
|
+
|
239
|
+
|
240
|
+
|
241
|
+
// runetochar copies (encodes) one rune, pointed to by r, to at most
|
242
|
+
// UTFmax bytes starting at s and returns the number of bytes generated.
|
243
|
+
int runetochar(char *str, const char32 *rune) {
|
244
|
+
// Convert to unsigned for range check.
|
245
|
+
unsigned long c;
|
246
|
+
|
247
|
+
// 1 char 00-7F
|
248
|
+
c = *rune;
|
249
|
+
if(c <= 0x7F) {
|
250
|
+
str[0] = static_cast<char>(c);
|
251
|
+
return 1;
|
252
|
+
}
|
253
|
+
|
254
|
+
// 2 char 0080-07FF
|
255
|
+
if(c <= 0x07FF) {
|
256
|
+
str[0] = 0xC0 | static_cast<char>(c >> 1*6);
|
257
|
+
str[1] = 0x80 | (c & 0x3F);
|
258
|
+
return 2;
|
259
|
+
}
|
260
|
+
|
261
|
+
// Range check
|
262
|
+
if (c > Runemax) {
|
263
|
+
c = Runeerror;
|
264
|
+
}
|
265
|
+
|
266
|
+
// 3 char 0800-FFFF
|
267
|
+
if (c <= 0xFFFF) {
|
268
|
+
str[0] = 0xE0 | static_cast<char>(c >> 2*6);
|
269
|
+
str[1] = 0x80 | ((c >> 1*6) & 0x3F);
|
270
|
+
str[2] = 0x80 | (c & 0x3F);
|
271
|
+
return 3;
|
272
|
+
}
|
273
|
+
|
274
|
+
// 4 char 10000-1FFFFF
|
275
|
+
str[0] = 0xF0 | static_cast<char>(c >> 3*6);
|
276
|
+
str[1] = 0x80 | ((c >> 2*6) & 0x3F);
|
277
|
+
str[2] = 0x80 | ((c >> 1*6) & 0x3F);
|
278
|
+
str[3] = 0x80 | (c & 0x3F);
|
279
|
+
return 4;
|
280
|
+
}
|
281
|
+
|
282
|
+
|
283
|
+
|
284
|
+
// Useful for converting an entity to an ascii value.
|
285
|
+
// RETURNS unicode value, or -1 if entity isn't valid. Don't include & or ;
|
286
|
+
int LookupEntity(const char* entity_name, int entity_len) {
|
287
|
+
// Make a C string
|
288
|
+
if (entity_len >= 16) {return -1;} // All real entities are shorter
|
289
|
+
char temp[16];
|
290
|
+
memcpy(temp, entity_name, entity_len);
|
291
|
+
temp[entity_len] = '\0';
|
292
|
+
int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity);
|
293
|
+
if (match >= 0) {return kNameToEntity[match].i;}
|
294
|
+
return -1;
|
295
|
+
}
|
296
|
+
|
297
|
+
// True for the ASCII decimal digits '0'..'9' only.
bool ascii_isdigit(char c) {
  return !(c < '0' || c > '9');
}
|
300
|
+
// True for ASCII hexadecimal digits: '0'..'9', 'a'..'f' and 'A'..'F'.
bool ascii_isxdigit(char c) {
  const bool is_decimal = (c >= '0') && (c <= '9');
  const bool is_lower_hex = (c >= 'a') && (c <= 'f');
  const bool is_upper_hex = (c >= 'A') && (c <= 'F');
  return is_decimal || is_lower_hex || is_upper_hex;
}
|
306
|
+
// True for ASCII letters and decimal digits: '0'..'9', 'a'..'z', 'A'..'Z'.
bool ascii_isalnum(char c) {
  const bool is_decimal = (c >= '0') && (c <= '9');
  const bool is_lower = (c >= 'a') && (c <= 'z');
  const bool is_upper = (c >= 'A') && (c <= 'Z');
  return is_decimal || is_lower || is_upper;
}
|
312
|
+
// Returns the value 0..15 of an ASCII hexadecimal digit. Any other character
// yields 0.
int hex_digit_to_int(char c) {
  int value = 0;
  if ((c >= 'A') && (c <= 'F')) {
    value = 10 + (c - 'A');
  } else if ((c >= 'a') && (c <= 'f')) {
    value = 10 + (c - 'a');
  } else if ((c >= '0') && (c <= '9')) {
    value = c - '0';
  }
  return value;
}
|
318
|
+
|
319
|
+
// Parses the run of decimal digits in [nptr, limit).
//
// *endptr is set to nptr first, and moved to just past the digit run only
// when a run is found. Leading '0' characters are skipped before the run is
// measured, so input made only of zeros counts as "no digits".
//
// Returns -1 when there is no digit run, 0xFFFD when the run is treated as
// too large, and otherwise FixUnicodeValue() of the parsed value.
static int32 strto32_base10(const char* nptr, const char* limit,
                            const char **endptr) {
  *endptr = nptr;

  // Step over leading zeros; they carry no value.
  const char* first = nptr;
  while (first < limit && *first == '0') {
    ++first;
  }
  if (first == limit || !ascii_isdigit(*first)) {
    return -1;
  }

  // Find the end of the digit run.
  const char* stop = first;
  while (stop < limit && ascii_isdigit(*stop)) {
    ++stop;
  }
  *endptr = stop;

  // kint32max == 2147483647. Runs shorter than 9 digits, and 10-digit runs
  // not above kint32max, are converted; everything else (including 9-digit
  // runs) takes the overflow path.
  const int num_digits = stop - first;
  const bool convertible =
      (num_digits < 9) ||
      (num_digits == 10 && memcmp(first, "2147483647", 10) <= 0);
  if (!convertible) {
    // Overflow: returns the replacement character 0xFFFD.
    return 0xFFFD;
  }

  int value = 0;
  for (const char* p = first; p < stop; ++p) {
    value = value * 10 + (*p - '0');
  }
  // Overflow past the last valid unicode codepoint
  // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
  return FixUnicodeValue(value);
}
|
350
|
+
|
351
|
+
// Parses the run of hexadecimal digits in [nptr, limit).
//
// *endptr is set to nptr first, and moved to just past the digit run only
// when a run is found. Leading '0' characters are skipped before the run is
// measured, so input made only of zeros counts as "no digits".
//
// Returns -1 when there is no hex-digit run, 0xFFFD when the run cannot fit
// in an int32, and otherwise FixUnicodeValue() of the parsed value.
static int32 strto32_base16(const char* nptr, const char* limit,
                            const char **endptr) {
  *endptr = nptr;

  // Step over leading zeros; they carry no value.
  const char* first = nptr;
  while (first < limit && *first == '0') {
    ++first;
  }
  if (first == limit || !ascii_isxdigit(*first)) {
    return -1;
  }

  // Find the end of the hex-digit run.
  const char* stop = first;
  while (stop < limit && ascii_isxdigit(*stop)) {
    ++stop;
  }
  *endptr = stop;

  // kint32max == 0x7FFFFFFF: at most 8 digits, and with exactly 8 the first
  // digit must be below '8'.
  const int num_xdigits = stop - first;
  const bool fits_int32 =
      (num_xdigits < 8) || (num_xdigits == 8 && first[0] < '8');
  if (!fits_int32) {
    // Overflow: returns the replacement character 0xFFFD.
    return 0xFFFD;
  }

  int value = 0;
  for (const char* p = first; p < stop; ++p) {
    value = (value << 4) + hex_digit_to_int(*p);
  }
  // Overflow past the last valid unicode codepoint
  // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
  return FixUnicodeValue(value);
}
|
382
|
+
|
383
|
+
// Unescape the current character pointed to by src. SETS the number
|
384
|
+
// of chars read for the conversion (in UTF8). If src isn't a valid entity,
|
385
|
+
// just consume the & and RETURN -1. If src doesn't point to & -- which it
|
386
|
+
// should -- set src_consumed to 0 and RETURN -1.
|
387
|
+
int ReadEntity(const char* src, int srcn, int* src_consumed) {
|
388
|
+
const char* const srcend = src + srcn;
|
389
|
+
|
390
|
+
if (srcn == 0 || *src != '&') { // input should start with an ampersand
|
391
|
+
*src_consumed = 0;
|
392
|
+
return -1;
|
393
|
+
}
|
394
|
+
*src_consumed = 1; // we'll get the & at least
|
395
|
+
|
396
|
+
// The standards are a bit unclear on when an entity ends. Certainly a ";"
|
397
|
+
// ends one, but spaces probably do too. We follow the lead of both IE and
|
398
|
+
// Netscape, which as far as we can tell end numeric entities (1st case below)
|
399
|
+
// at any non-digit, and end character entities (2nd case) at any non-alnum.
|
400
|
+
const char* entstart, *entend; // where the entity starts and ends
|
401
|
+
entstart = src + 1; // read past the &
|
402
|
+
int entval; // UCS2 value of the entity
|
403
|
+
if ( *entstart == '#' ) { // -- 1st case: numeric entity
|
404
|
+
if ( entstart + 2 >= srcend ) {
|
405
|
+
return -1; // no way a legitimate number could fit
|
406
|
+
} else if ( entstart[1] == 'x' || entstart[1] == 'X' ) { // hex numeric
|
407
|
+
entval = strto32_base16(entstart + 2, srcend, &entend);
|
408
|
+
} else { // decimal numeric entity
|
409
|
+
entval = strto32_base10(entstart+1, srcend, &entend);
|
410
|
+
}
|
411
|
+
if (entval == -1 || entend > srcend) {
|
412
|
+
return -1; // not entirely correct, but close enough
|
413
|
+
}
|
414
|
+
} else { // -- 2nd case: character entity
|
415
|
+
for (entend = entstart;
|
416
|
+
entend < srcend && ascii_isalnum(*entend);
|
417
|
+
++entend ) {
|
418
|
+
// entity consists of alphanumeric chars
|
419
|
+
}
|
420
|
+
entval = LookupEntity(entstart, entend - entstart);
|
421
|
+
if (entval < 0) {
|
422
|
+
return -1; // not a legal entity name
|
423
|
+
}
|
424
|
+
// Now we do a strange-seeming IE6-compatibility check: if entval is
|
425
|
+
// >= 256, it *must* be followed by a semicolon or it's not considered
|
426
|
+
// an entity. The problem is lots of the newfangled entity names, like
|
427
|
+
// "lang", also occur in URL CGI arguments: "/search?q=test&lang=en".
|
428
|
+
// When these links are written in HTML, it would be really bad if the
|
429
|
+
// "&lang" were treated as an entity, which is what the spec says
|
430
|
+
// *should* happen (even when the HTML is inside an "A HREF" tag!)
|
431
|
+
// IE ignores the spec for these new, high-value entities, so we do too.
|
432
|
+
if ( entval >= 256 && !(entend < srcend && *entend == ';') ) {
|
433
|
+
return -1; // make non-;-terminated entity illegal
|
434
|
+
}
|
435
|
+
}
|
436
|
+
|
437
|
+
// Finally, figure out how much src was consumed
|
438
|
+
if ( entend < srcend && *entend == ';' ) {
|
439
|
+
entend++; // standard says ; terminator is special
|
440
|
+
}
|
441
|
+
*src_consumed = entend - src;
|
442
|
+
return entval;
|
443
|
+
}
|
444
|
+
|
445
|
+
|
446
|
+
// Src points to '&'
|
447
|
+
// Writes entity value to dst. Returns take(src), put(dst) byte counts
|
448
|
+
void EntityToBuffer(const char* src, int len, char* dst,
|
449
|
+
int* tlen, int* plen) {
|
450
|
+
char32 entval = ReadEntity(src, len, tlen);
|
451
|
+
|
452
|
+
// ReadEntity does this already: entval = FixUnicodeValue(entval);
|
453
|
+
|
454
|
+
// Convert UTF-32 to UTF-8
|
455
|
+
if (entval > 0) {
|
456
|
+
*plen = runetochar(dst, &entval);
|
457
|
+
} else {
|
458
|
+
// Illegal entity; ignore the '&'
|
459
|
+
*tlen = 1;
|
460
|
+
*plen = 0;
|
461
|
+
}
|
462
|
+
}
|
463
|
+
|
464
|
+
// Returns true if character is < > or &, none of which are letters
|
465
|
+
bool inline IsSpecial(char c) {
|
466
|
+
// Comparison (int != 0) is used to silence the warning:
|
467
|
+
// 'const char': forcing value to bool
|
468
|
+
if ((c & 0xe0) == 0x20) {
|
469
|
+
return (kSpecialSymbol[static_cast<uint8>(c)] != 0);
|
470
|
+
}
|
471
|
+
return false;
|
472
|
+
}
|
473
|
+
|
474
|
+
// Quick Skip to next letter or < > & or to end of string (eos)
|
475
|
+
// Always return is_letter for eos
|
476
|
+
int ScanToLetterOrSpecial(const char* src, int len) {
|
477
|
+
int bytes_consumed;
|
478
|
+
StringPiece str(src, len);
|
479
|
+
UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed);
|
480
|
+
return bytes_consumed;
|
481
|
+
}
|
482
|
+
|
483
|
+
|
484
|
+
|
485
|
+
|
486
|
+
// src points to non-letter, such as tag-opening '<'
|
487
|
+
// Return length from here to next possible letter
|
488
|
+
// On another < before >, return 1
|
489
|
+
// advances <tag>
|
490
|
+
// | |
|
491
|
+
// advances <tag> ... </tag> for <script> <style>
|
492
|
+
// | |
|
493
|
+
// advances <!-- ... <tag> ... -->
|
494
|
+
// | |
|
495
|
+
// advances <tag
|
496
|
+
// | | end of string
|
497
|
+
// advances <tag <tag2>
|
498
|
+
// ||
|
499
|
+
int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) {
|
500
|
+
const uint8* src = reinterpret_cast<const uint8*>(isrc);
|
501
|
+
const uint8* srclimit = src + len;
|
502
|
+
const uint8* tagParseTbl = kTagParseTbl_0;
|
503
|
+
int e = 0;
|
504
|
+
while (src < srclimit) {
|
505
|
+
e = tagParseTbl[kCharToSub[*src++]];
|
506
|
+
if (e <= max_exit_state) {
|
507
|
+
// We overshot by one byte
|
508
|
+
--src;
|
509
|
+
break;
|
510
|
+
}
|
511
|
+
tagParseTbl = &kTagParseTbl_0[e * 20];
|
512
|
+
}
|
513
|
+
|
514
|
+
if (src >= srclimit) {
|
515
|
+
// We fell off the end of the text.
|
516
|
+
// It looks like the most common case for this is a truncated file, not
|
517
|
+
// mismatched angle brackets. So we pretend that the last char was '>'
|
518
|
+
return len;
|
519
|
+
}
|
520
|
+
|
521
|
+
// OK to be in state 0 or state 2 at exit
|
522
|
+
if ((e != 0) && (e != 2)) {
|
523
|
+
// Error, '<' followed by '<'
|
524
|
+
// We want to back up to first <, then advance by one byte past it
|
525
|
+
int offset = src - reinterpret_cast<const uint8*>(isrc);
|
526
|
+
|
527
|
+
// Backscan to first '<' and return enough length to just get past it
|
528
|
+
--offset; // back up over the second '<', which caused us to stop
|
529
|
+
while ((0 < offset) && (isrc[offset] != '<')) {
|
530
|
+
// Find the first '<', which is unmatched
|
531
|
+
--offset;
|
532
|
+
}
|
533
|
+
// skip to just beyond first '<'
|
534
|
+
return offset + 1;
|
535
|
+
}
|
536
|
+
|
537
|
+
return src - reinterpret_cast<const uint8*>(isrc);
|
538
|
+
}
|
539
|
+
|
540
|
+
// Returns mid if key found in lo <= mid < hi, else -1
|
541
|
+
int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) {
|
542
|
+
// binary search
|
543
|
+
while (lo < hi) {
|
544
|
+
int mid = (lo + hi) >> 1;
|
545
|
+
if (strcmp(key, cipair[mid].s) < 0) {
|
546
|
+
hi = mid;
|
547
|
+
} else if (strcmp(key, cipair[mid].s) > 0) {
|
548
|
+
lo = mid + 1;
|
549
|
+
} else {
|
550
|
+
return mid;
|
551
|
+
}
|
552
|
+
}
|
553
|
+
return -1;
|
554
|
+
}
|
555
|
+
|
556
|
+
// Returns the length in bytes of the prefix of src that is all
|
557
|
+
// interchange valid UTF-8
|
558
|
+
int SpanInterchangeValid(const char* src, int byte_length) {
|
559
|
+
int bytes_consumed;
|
560
|
+
const UTF8ReplaceObj* st = &utf8acceptinterchange_obj;
|
561
|
+
StringPiece str(src, byte_length);
|
562
|
+
UTF8GenericScan(st, str, &bytes_consumed);
|
563
|
+
return bytes_consumed;
|
564
|
+
}
|
565
|
+
|
566
|
+
// Basic scanner over buffer[0..buffer_length): letters and marks only, one
// script per span. Allocates the two working buffers, which the destructor
// releases.
ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text)
    : start_byte_(buffer),
      next_byte_(buffer),
      byte_length_(buffer_length),
      is_plain_text_(is_plain_text),
      letters_marks_only_(true),
      one_script_only_(true),
      exit_state_(kMaxExitStateLettersMarksOnly) {
  // Both offset maps start out empty
  map2original_.Clear();    // script_buffer_ -> buffer
  map2uplow_.Clear();       // script_buffer_lower_ -> script_buffer_

  // Working storage for the extracted span and for its lowercased copy
  script_buffer_ = new char[kMaxScriptBuffer];
  script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
}
|
581
|
+
|
582
|
+
// Extended version to allow spans of any non-tag text and spans of mixed script
|
583
|
+
ScriptScanner::ScriptScanner(const char* buffer,
|
584
|
+
int buffer_length,
|
585
|
+
bool is_plain_text,
|
586
|
+
bool any_text,
|
587
|
+
bool any_script)
|
588
|
+
: start_byte_(buffer),
|
589
|
+
next_byte_(buffer),
|
590
|
+
byte_length_(buffer_length),
|
591
|
+
is_plain_text_(is_plain_text),
|
592
|
+
letters_marks_only_(!any_text),
|
593
|
+
one_script_only_(!any_script),
|
594
|
+
exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) {
|
595
|
+
script_buffer_ = new char[kMaxScriptBuffer];
|
596
|
+
script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
|
597
|
+
map2original_.Clear(); // map from script_buffer_ to buffer
|
598
|
+
map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_
|
599
|
+
}
|
600
|
+
|
601
|
+
|
602
|
+
// Releases the two working buffers allocated by the constructors.
ScriptScanner::~ScriptScanner() {
  delete[] script_buffer_lower_;
  delete[] script_buffer_;
}
|
606
|
+
|
607
|
+
|
608
|
+
|
609
|
+
|
610
|
+
// Get to the first real non-tag letter or entity that is a letter
|
611
|
+
// Sets script of that letter
|
612
|
+
// Return len if no more letters
|
613
|
+
int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
|
614
|
+
int sc = UNKNOWN_ULSCRIPT;
|
615
|
+
int skip = 0;
|
616
|
+
int tlen, plen;
|
617
|
+
|
618
|
+
// Do run of non-letters (tag | &NL | NL)*
|
619
|
+
tlen = 0;
|
620
|
+
while (skip < len) {
|
621
|
+
// Do fast scan to next interesting byte
|
622
|
+
// int oldskip = skip;
|
623
|
+
skip += ScanToLetterOrSpecial(src + skip, len - skip);
|
624
|
+
|
625
|
+
// Check for no more letters/specials
|
626
|
+
if (skip >= len) {
|
627
|
+
// All done
|
628
|
+
*script = sc;
|
629
|
+
return len;
|
630
|
+
}
|
631
|
+
|
632
|
+
// We are at a letter, nonletter, tag, or entity
|
633
|
+
if (IsSpecial(src[skip]) && !is_plain_text_) {
|
634
|
+
if (src[skip] == '<') {
|
635
|
+
// Begining of tag; skip to end and go around again
|
636
|
+
tlen = ScanToPossibleLetter(src + skip, len - skip,
|
637
|
+
exit_state_);
|
638
|
+
sc = 0;
|
639
|
+
} else if (src[skip] == '>') {
|
640
|
+
// Unexpected end of tag; skip it and go around again
|
641
|
+
tlen = 1; // Over the >
|
642
|
+
sc = 0;
|
643
|
+
} else if (src[skip] == '&') {
|
644
|
+
// Expand entity, no advance
|
645
|
+
char temp[4];
|
646
|
+
EntityToBuffer(src + skip, len - skip,
|
647
|
+
temp, &tlen, &plen);
|
648
|
+
if (plen > 0) {
|
649
|
+
sc = GetUTF8LetterScriptNum(temp);
|
650
|
+
}
|
651
|
+
}
|
652
|
+
} else {
|
653
|
+
// Update 1..4 bytes
|
654
|
+
tlen = UTF8OneCharLen(src + skip);
|
655
|
+
sc = GetUTF8LetterScriptNum(src + skip);
|
656
|
+
}
|
657
|
+
if (sc != 0) {break;} // Letter found
|
658
|
+
skip += tlen; // Else advance
|
659
|
+
}
|
660
|
+
|
661
|
+
*script = sc;
|
662
|
+
return skip;
|
663
|
+
}
|
664
|
+
|
665
|
+
|
666
|
+
// For ASCII-only tag names.
// Compares the letter uplow against c, ignoring the case of uplow: bit 0x20
// is forced on in uplow before comparing, so c is expected to be lowercase.
inline bool EqCase(char uplow, char c) {
  const int folded = uplow | 0x20;
  return folded == c;
}
|
671
|
+
|
672
|
+
// For ASCII-only tag names.
// Returns true for space / < > etc.: every byte value below 0x40.
// The comparison is done on the unsigned byte value, so bytes >= 0x80 (parts
// of multi-byte UTF-8 characters) are never reported as delimiters and the
// result does not depend on whether plain char is signed on the platform.
inline bool NeqLetter(char c) {
  return static_cast<unsigned char>(c) < 0x40;
}
|
677
|
+
|
678
|
+
// For ASCII-only tag names.
// True only for the two whitespace bytes used in the output buffer:
// space and \n. Notably false for \r.
inline bool WS(char c) {
  switch (c) {
    case ' ':
    case '\n':
      return true;
    default:
      return false;
  }
}
|
683
|
+
|
684
|
+
// Canonical CR or LF
|
685
|
+
// The single line-break byte used in extracted text: input '\r' and '\n'
// are both rewritten to this value, as are the <p>, <br> and <tr> tags.
static const char LF = '\n';
|
686
|
+
|
687
|
+
|
688
|
+
// The naive loop scans from next_byte_ to script_buffer_ until full.
|
689
|
+
// But this can leave an awkward hard-to-identify short fragment at the
|
690
|
+
// end of the input. We would prefer to make the next-to-last fragment
|
691
|
+
// shorter and the last fragment longer.
|
692
|
+
|
693
|
+
// Copy next run of non-tag characters to buffer [NUL terminated]
|
694
|
+
// This just replaces tags with space or \n and removes entities.
|
695
|
+
// Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences
|
696
|
+
// including \r or \n are replaced by \n. All other tags and skipped text
|
697
|
+
// are replaced with ASCII space.
|
698
|
+
//
|
699
|
+
// Buffer ALWAYS has leading space and trailing space space space NUL
|
700
|
+
bool ScriptScanner::GetOneTextSpan(LangSpan* span) {
|
701
|
+
span->text = script_buffer_;
|
702
|
+
span->text_bytes = 0;
|
703
|
+
span->offset = next_byte_ - start_byte_;
|
704
|
+
span->ulscript = UNKNOWN_ULSCRIPT;
|
705
|
+
span->truncated = false;
|
706
|
+
|
707
|
+
int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
|
708
|
+
if ((kMaxScriptBytes <= byte_length_) &&
|
709
|
+
(byte_length_ < (2 * kMaxScriptBytes))) {
|
710
|
+
// Try to split the last two fragments in half
|
711
|
+
put_soft_limit = byte_length_ / 2;
|
712
|
+
}
|
713
|
+
|
714
|
+
script_buffer_[0] = ' '; // Always a space at front of output
|
715
|
+
script_buffer_[1] = '\0';
|
716
|
+
int take = 0;
|
717
|
+
int put = 1; // Start after the initial space
|
718
|
+
int tlen = 0, plen = 0;
|
719
|
+
|
720
|
+
if (byte_length_ <= 0) {
|
721
|
+
return false; // No more text to be found
|
722
|
+
}
|
723
|
+
|
724
|
+
// Go over alternating spans of text and tags,
|
725
|
+
// copying letters to buffer with single spaces for each run of non-letters
|
726
|
+
bool last_byte_was_space = false;
|
727
|
+
while (take < byte_length_) {
|
728
|
+
char c = next_byte_[take];
|
729
|
+
if (c == '\r') {c = LF;} // Canonical CR or LF
|
730
|
+
if (c == '\n') {c = LF;} // Canonical CR or LF
|
731
|
+
|
732
|
+
if (IsSpecial(c) && !is_plain_text_) {
|
733
|
+
if (c == '<') {
|
734
|
+
// Replace tag with space
|
735
|
+
c = ' '; // for almost-full test below
|
736
|
+
// or if <p> <br> <tr>, replace with \n
|
737
|
+
if (take < (byte_length_ - 3)) {
|
738
|
+
if (EqCase(next_byte_[take + 1], 'p') &&
|
739
|
+
NeqLetter(next_byte_[take + 2])) {
|
740
|
+
c = LF;
|
741
|
+
}
|
742
|
+
if (EqCase(next_byte_[take + 1], 'b') &&
|
743
|
+
EqCase(next_byte_[take + 2], 'r') &&
|
744
|
+
NeqLetter(next_byte_[take + 3])) {
|
745
|
+
c = LF;
|
746
|
+
}
|
747
|
+
if (EqCase(next_byte_[take + 1], 't') &&
|
748
|
+
EqCase(next_byte_[take + 2], 'r') &&
|
749
|
+
NeqLetter(next_byte_[take + 3])) {
|
750
|
+
c = LF;
|
751
|
+
}
|
752
|
+
}
|
753
|
+
// Begining of tag; skip to end and go around again
|
754
|
+
tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
|
755
|
+
exit_state_);
|
756
|
+
// Copy one byte, compressing spaces
|
757
|
+
if (!last_byte_was_space || !WS(c)) {
|
758
|
+
script_buffer_[put++] = c; // Advance dest
|
759
|
+
last_byte_was_space = WS(c);
|
760
|
+
}
|
761
|
+
} else if (c == '>') {
|
762
|
+
// Unexpected end of tag; copy it and go around again
|
763
|
+
tlen = 1; // Over the >
|
764
|
+
script_buffer_[put++] = c; // Advance dest
|
765
|
+
} else if (c == '&') {
|
766
|
+
// Expand entity, no advance
|
767
|
+
EntityToBuffer(next_byte_ + take, byte_length_ - take,
|
768
|
+
script_buffer_ + put, &tlen, &plen);
|
769
|
+
put += plen; // Advance dest
|
770
|
+
}
|
771
|
+
take += tlen; // Advance source
|
772
|
+
} else {
|
773
|
+
// Copy one byte, compressing spaces
|
774
|
+
if (!last_byte_was_space || !WS(c)) {
|
775
|
+
script_buffer_[put++] = c; // Advance dest
|
776
|
+
last_byte_was_space = WS(c);
|
777
|
+
}
|
778
|
+
++take; // Advance source
|
779
|
+
}
|
780
|
+
|
781
|
+
if (WS(c) &&
|
782
|
+
(put >= put_soft_limit)) {
|
783
|
+
// Buffer is almost full
|
784
|
+
span->truncated = true;
|
785
|
+
break;
|
786
|
+
}
|
787
|
+
if (put >= kMaxScriptBytes) {
|
788
|
+
// Buffer is completely full
|
789
|
+
span->truncated = true;
|
790
|
+
break;
|
791
|
+
}
|
792
|
+
}
|
793
|
+
|
794
|
+
// Almost done. Back up to a character boundary if needed
|
795
|
+
while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) {
|
796
|
+
// Back up over continuation byte
|
797
|
+
--take;
|
798
|
+
--put;
|
799
|
+
}
|
800
|
+
|
801
|
+
// Update input position
|
802
|
+
next_byte_ += take;
|
803
|
+
byte_length_ -= take;
|
804
|
+
|
805
|
+
// Put four more spaces/NUL. Worst case is abcd _ _ _ \0
|
806
|
+
// kMaxScriptBytes | | put
|
807
|
+
script_buffer_[put + 0] = ' ';
|
808
|
+
script_buffer_[put + 1] = ' ';
|
809
|
+
script_buffer_[put + 2] = ' ';
|
810
|
+
script_buffer_[put + 3] = '\0';
|
811
|
+
|
812
|
+
span->text_bytes = put; // Does not include the last four chars above
|
813
|
+
return true;
|
814
|
+
}
|
815
|
+
|
816
|
+
|
817
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
818
|
+
// Buffer ALWAYS has leading space and trailing space space space NUL
|
819
|
+
bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
|
820
|
+
if (!letters_marks_only_) {
|
821
|
+
// Return non-tag text, including punctuation and digits
|
822
|
+
return GetOneTextSpan(span);
|
823
|
+
}
|
824
|
+
|
825
|
+
span->text = script_buffer_;
|
826
|
+
span->text_bytes = 0;
|
827
|
+
span->offset = next_byte_ - start_byte_;
|
828
|
+
span->ulscript = UNKNOWN_ULSCRIPT;
|
829
|
+
span->truncated = false;
|
830
|
+
|
831
|
+
// struct timeval script_start, script_mid, script_end;
|
832
|
+
|
833
|
+
int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
|
834
|
+
if ((kMaxScriptBytes <= byte_length_) &&
|
835
|
+
(byte_length_ < (2 * kMaxScriptBytes))) {
|
836
|
+
// Try to split the last two fragments in half
|
837
|
+
put_soft_limit = byte_length_ / 2;
|
838
|
+
}
|
839
|
+
|
840
|
+
|
841
|
+
int spanscript; // The script of this span
|
842
|
+
int sc = UNKNOWN_ULSCRIPT; // The script of next character
|
843
|
+
int tlen = 0;
|
844
|
+
int plen = 0;
|
845
|
+
|
846
|
+
script_buffer_[0] = ' '; // Always a space at front of output
|
847
|
+
script_buffer_[1] = '\0';
|
848
|
+
int take = 0;
|
849
|
+
int put = 1; // Start after the initial space
|
850
|
+
|
851
|
+
// Build offsets from span->text back to start_byte_ + span->offset
|
852
|
+
// This mapping reflects deletion of non-letters, expansion of
|
853
|
+
// entities, etc.
|
854
|
+
map2original_.Clear();
|
855
|
+
map2original_.Delete(span->offset); // So that MapBack(0) gives offset
|
856
|
+
|
857
|
+
// Get to the first real non-tag letter or entity that is a letter
|
858
|
+
int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
|
859
|
+
next_byte_ += skip;
|
860
|
+
byte_length_ -= skip;
|
861
|
+
|
862
|
+
if (skip != 1) {
|
863
|
+
map2original_.Delete(skip);
|
864
|
+
map2original_.Insert(1);
|
865
|
+
} else {
|
866
|
+
map2original_.Copy(1);
|
867
|
+
}
|
868
|
+
if (byte_length_ <= 0) {
|
869
|
+
map2original_.Reset();
|
870
|
+
return false; // No more letters to be found
|
871
|
+
}
|
872
|
+
|
873
|
+
// There is at least one letter, so we know the script for this span
|
874
|
+
span->ulscript = (ULScript)spanscript;
|
875
|
+
|
876
|
+
|
877
|
+
// Go over alternating spans of same-script letters and non-letters,
|
878
|
+
// copying letters to buffer with single spaces for each run of non-letters
|
879
|
+
while (take < byte_length_) {
|
880
|
+
// Copy run of letters in same script (&LS | LS)*
|
881
|
+
int letter_count = 0; // Keep track of word length
|
882
|
+
bool need_break = false;
|
883
|
+
|
884
|
+
while (take < byte_length_) {
|
885
|
+
// We are at a letter, nonletter, tag, or entity
|
886
|
+
if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
|
887
|
+
if (next_byte_[take] == '<') {
|
888
|
+
// Begining of tag
|
889
|
+
sc = 0;
|
890
|
+
break;
|
891
|
+
} else if (next_byte_[take] == '>') {
|
892
|
+
// Unexpected end of tag
|
893
|
+
sc = 0;
|
894
|
+
break;
|
895
|
+
} else if (next_byte_[take] == '&') {
|
896
|
+
// Copy entity, no advance
|
897
|
+
EntityToBuffer(next_byte_ + take, byte_length_ - take,
|
898
|
+
script_buffer_ + put, &tlen, &plen);
|
899
|
+
if (plen > 0) {
|
900
|
+
sc = GetUTF8LetterScriptNum(script_buffer_ + put);
|
901
|
+
}
|
902
|
+
}
|
903
|
+
} else {
|
904
|
+
// Real letter, safely copy up to 4 bytes, increment by 1..4
|
905
|
+
// Will update by 1..4 bytes at Advance, below
|
906
|
+
tlen = plen = UTF8OneCharLen(next_byte_ + take);
|
907
|
+
if (take < (byte_length_ - 3)) {
|
908
|
+
// X86 fast case, does unaligned load/store
|
909
|
+
UNALIGNED_STORE32(script_buffer_ + put,
|
910
|
+
UNALIGNED_LOAD32(next_byte_ + take));
|
911
|
+
|
912
|
+
} else {
|
913
|
+
// Slow case, happens 1-3 times per input document
|
914
|
+
memcpy(script_buffer_ + put, next_byte_ + take, plen);
|
915
|
+
}
|
916
|
+
sc = GetUTF8LetterScriptNum(next_byte_ + take);
|
917
|
+
}
|
918
|
+
|
919
|
+
// Allow continue across a single letter in a different script:
|
920
|
+
// A B D = three scripts, c = common script, i = inherited script,
|
921
|
+
// - = don't care, ( = take position before the += below
|
922
|
+
// AAA(A- continue
|
923
|
+
//
|
924
|
+
// AAA(BA continue
|
925
|
+
// AAA(BB break
|
926
|
+
// AAA(Bc continue (breaks after B)
|
927
|
+
// AAA(BD break
|
928
|
+
// AAA(Bi break
|
929
|
+
//
|
930
|
+
// AAA(c- break
|
931
|
+
//
|
932
|
+
// AAA(i- continue
|
933
|
+
//
|
934
|
+
|
935
|
+
if ((sc != spanscript) && (sc != ULScript_Inherited)) {
|
936
|
+
// Might need to break this script span
|
937
|
+
if (sc == ULScript_Common) {
|
938
|
+
need_break = true;
|
939
|
+
} else {
|
940
|
+
// Look at next following character, ignoring entity as Common
|
941
|
+
int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen);
|
942
|
+
if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
|
943
|
+
// We found a non-trivial change of script
|
944
|
+
if (one_script_only_) {
|
945
|
+
need_break = true;
|
946
|
+
}
|
947
|
+
}
|
948
|
+
}
|
949
|
+
}
|
950
|
+
if (need_break) {break;} // Non-letter or letter in wrong script
|
951
|
+
|
952
|
+
take += tlen; // Advance
|
953
|
+
put += plen; // Advance
|
954
|
+
|
955
|
+
// Update the offset map to reflect take/put lengths
|
956
|
+
if (tlen == plen) {
|
957
|
+
map2original_.Copy(tlen);
|
958
|
+
} else if (tlen < plen) {
|
959
|
+
map2original_.Copy(tlen);
|
960
|
+
map2original_.Insert(plen - tlen);
|
961
|
+
} else { // plen < tlen
|
962
|
+
map2original_.Copy(plen);
|
963
|
+
map2original_.Delete(tlen - plen);
|
964
|
+
}
|
965
|
+
|
966
|
+
++letter_count;
|
967
|
+
if (put >= kMaxScriptBytes) {
|
968
|
+
// Buffer is full
|
969
|
+
span->truncated = true;
|
970
|
+
break;
|
971
|
+
}
|
972
|
+
} // End while letters
|
973
|
+
|
974
|
+
// Do run of non-letters (tag | &NL | NL)*
|
975
|
+
while (take < byte_length_) {
|
976
|
+
// Do fast scan to next interesting byte
|
977
|
+
tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
|
978
|
+
take += tlen;
|
979
|
+
map2original_.Delete(tlen);
|
980
|
+
if (take >= byte_length_) {break;} // Might have scanned to end
|
981
|
+
|
982
|
+
// We are at a letter, nonletter, tag, or entity
|
983
|
+
if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
|
984
|
+
if (next_byte_[take] == '<') {
|
985
|
+
// Begining of tag; skip to end and go around again
|
986
|
+
tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
|
987
|
+
exit_state_);
|
988
|
+
sc = 0;
|
989
|
+
} else if (next_byte_[take] == '>') {
|
990
|
+
// Unexpected end of tag; skip it and go around again
|
991
|
+
tlen = 1; // Over the >
|
992
|
+
sc = 0;
|
993
|
+
} else if (next_byte_[take] == '&') {
|
994
|
+
// Expand entity, no advance
|
995
|
+
EntityToBuffer(next_byte_ + take, byte_length_ - take,
|
996
|
+
script_buffer_ + put, &tlen, &plen);
|
997
|
+
if (plen > 0) {
|
998
|
+
sc = GetUTF8LetterScriptNum(script_buffer_ + put);
|
999
|
+
}
|
1000
|
+
}
|
1001
|
+
} else {
|
1002
|
+
// Update 1..4
|
1003
|
+
tlen = UTF8OneCharLen(next_byte_ + take);
|
1004
|
+
sc = GetUTF8LetterScriptNum(next_byte_ + take);
|
1005
|
+
}
|
1006
|
+
if (sc != 0) {break;} // Letter found
|
1007
|
+
take += tlen; // Else advance
|
1008
|
+
map2original_.Delete(tlen);
|
1009
|
+
} // End while not-letters
|
1010
|
+
|
1011
|
+
script_buffer_[put++] = ' ';
|
1012
|
+
map2original_.Insert(1);
|
1013
|
+
|
1014
|
+
// Letter in wrong script ?
|
1015
|
+
if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;}
|
1016
|
+
if (put >= put_soft_limit) {
|
1017
|
+
// Buffer is almost full
|
1018
|
+
span->truncated = true;
|
1019
|
+
break;
|
1020
|
+
}
|
1021
|
+
}
|
1022
|
+
|
1023
|
+
// Almost done. Back up to a character boundary if needed
|
1024
|
+
while ((0 < take) && (take < byte_length_) &&
|
1025
|
+
((next_byte_[take] & 0xc0) == 0x80)) {
|
1026
|
+
// Back up over continuation byte
|
1027
|
+
--take;
|
1028
|
+
--put;
|
1029
|
+
}
|
1030
|
+
|
1031
|
+
// Update input position
|
1032
|
+
next_byte_ += take;
|
1033
|
+
byte_length_ -= take;
|
1034
|
+
|
1035
|
+
// Put four more spaces/NUL. Worst case is abcd _ _ _ \0
|
1036
|
+
// kMaxScriptBytes | | put
|
1037
|
+
script_buffer_[put + 0] = ' ';
|
1038
|
+
script_buffer_[put + 1] = ' ';
|
1039
|
+
script_buffer_[put + 2] = ' ';
|
1040
|
+
script_buffer_[put + 3] = '\0';
|
1041
|
+
map2original_.Insert(4);
|
1042
|
+
map2original_.Reset();
|
1043
|
+
|
1044
|
+
span->text_bytes = put; // Does not include the last four chars above
|
1045
|
+
return true;
|
1046
|
+
}
|
1047
|
+
|
1048
|
+
// Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase
|
1049
|
+
// List changes with each version of Unicode, so just always lowercase
|
1050
|
+
// Unicode 6.2.0:
|
1051
|
+
// ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN
|
1052
|
+
void ScriptScanner::LowerScriptSpan(LangSpan* span) {
|
1053
|
+
// If needed, lowercase all the text. If we do it sooner, might miss
|
1054
|
+
// lowercasing an entity such as Á
|
1055
|
+
// We only need to do this for Latn and Cyrl scripts
|
1056
|
+
map2uplow_.Clear();
|
1057
|
+
// Full Unicode lowercase of the entire buffer, including
|
1058
|
+
// four pad bytes off the end.
|
1059
|
+
// Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad
|
1060
|
+
// bytes and put the 0x00 in explicitly.
|
1061
|
+
// Build an offset map from script_buffer_lower_ back to script_buffer_
|
1062
|
+
int consumed, filled, changed;
|
1063
|
+
StringPiece istr(span->text, span->text_bytes + 3);
|
1064
|
+
StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer);
|
1065
|
+
|
1066
|
+
UTF8GenericReplace(&utf8repl_lettermarklower_obj,
|
1067
|
+
istr, ostr, is_plain_text_,
|
1068
|
+
&consumed, &filled, &changed, &map2uplow_);
|
1069
|
+
script_buffer_lower_[filled] = '\0';
|
1070
|
+
span->text = script_buffer_lower_;
|
1071
|
+
span->text_bytes = filled - 3;
|
1072
|
+
map2uplow_.Reset();
|
1073
|
+
}
|
1074
|
+
|
1075
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
1076
|
+
// Force Latin, Cyrillic, Greek scripts to be lowercase
|
1077
|
+
// Buffer ALWAYS has leading space and trailing space space space NUL
|
1078
|
+
bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) {
|
1079
|
+
bool ok = GetOneScriptSpan(span);
|
1080
|
+
if (ok) {
|
1081
|
+
LowerScriptSpan(span);
|
1082
|
+
}
|
1083
|
+
return ok;
|
1084
|
+
}
|
1085
|
+
|
1086
|
+
// Maps byte offset in most recent GetOneScriptSpan/Lower
|
1087
|
+
// span->text [0..text_bytes] into an additional byte offset from
|
1088
|
+
// span->offset, to get back to corresponding text in the original
|
1089
|
+
// input buffer.
|
1090
|
+
// text_offset must be the first byte
|
1091
|
+
// of a UTF-8 character, or just beyond the last character. Normally this
|
1092
|
+
// routine is called with the first byte of an interesting range and
|
1093
|
+
// again with the first byte of the following range.
|
1094
|
+
int ScriptScanner::MapBack(int text_offset) {
|
1095
|
+
return map2original_.MapBack(map2uplow_.MapBack(text_offset));
|
1096
|
+
}
|
1097
|
+
|
1098
|
+
|
1099
|
+
// Gets lscript number for letters; always returns
|
1100
|
+
// 0 (common script) for non-letters
|
1101
|
+
int GetUTF8LetterScriptNum(const char* src) {
|
1102
|
+
int srclen = UTF8OneCharLen(src);
|
1103
|
+
const uint8* usrc = reinterpret_cast<const uint8*>(src);
|
1104
|
+
return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj,
|
1105
|
+
&usrc, &srclen);
|
1106
|
+
}
|
1107
|
+
|
1108
|
+
} // namespace CLD2
|
1109
|
+
} // namespace chrome_lang_id
|