cld3 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +18 -0
- data/LICENSE +204 -0
- data/LICENSE_CLD3 +203 -0
- data/README.md +22 -0
- data/cld3.gemspec +35 -0
- data/ext/cld3/base.cc +36 -0
- data/ext/cld3/base.h +106 -0
- data/ext/cld3/casts.h +98 -0
- data/ext/cld3/embedding_feature_extractor.cc +51 -0
- data/ext/cld3/embedding_feature_extractor.h +182 -0
- data/ext/cld3/embedding_network.cc +196 -0
- data/ext/cld3/embedding_network.h +186 -0
- data/ext/cld3/embedding_network_params.h +285 -0
- data/ext/cld3/extconf.rb +49 -0
- data/ext/cld3/feature_extractor.cc +137 -0
- data/ext/cld3/feature_extractor.h +633 -0
- data/ext/cld3/feature_extractor.proto +50 -0
- data/ext/cld3/feature_types.cc +72 -0
- data/ext/cld3/feature_types.h +158 -0
- data/ext/cld3/fixunicodevalue.cc +55 -0
- data/ext/cld3/fixunicodevalue.h +69 -0
- data/ext/cld3/float16.h +58 -0
- data/ext/cld3/fml_parser.cc +308 -0
- data/ext/cld3/fml_parser.h +123 -0
- data/ext/cld3/generated_entities.cc +296 -0
- data/ext/cld3/generated_ulscript.cc +678 -0
- data/ext/cld3/generated_ulscript.h +142 -0
- data/ext/cld3/getonescriptspan.cc +1109 -0
- data/ext/cld3/getonescriptspan.h +124 -0
- data/ext/cld3/integral_types.h +37 -0
- data/ext/cld3/lang_id_nn_params.cc +57449 -0
- data/ext/cld3/lang_id_nn_params.h +178 -0
- data/ext/cld3/language_identifier_features.cc +165 -0
- data/ext/cld3/language_identifier_features.h +116 -0
- data/ext/cld3/nnet_language_identifier.cc +380 -0
- data/ext/cld3/nnet_language_identifier.h +175 -0
- data/ext/cld3/nnet_language_identifier_c.cc +72 -0
- data/ext/cld3/offsetmap.cc +478 -0
- data/ext/cld3/offsetmap.h +168 -0
- data/ext/cld3/port.h +143 -0
- data/ext/cld3/registry.cc +28 -0
- data/ext/cld3/registry.h +242 -0
- data/ext/cld3/relevant_script_feature.cc +89 -0
- data/ext/cld3/relevant_script_feature.h +49 -0
- data/ext/cld3/script_detector.h +156 -0
- data/ext/cld3/sentence.proto +77 -0
- data/ext/cld3/sentence_features.cc +29 -0
- data/ext/cld3/sentence_features.h +35 -0
- data/ext/cld3/simple_adder.h +72 -0
- data/ext/cld3/stringpiece.h +81 -0
- data/ext/cld3/task_context.cc +161 -0
- data/ext/cld3/task_context.h +81 -0
- data/ext/cld3/task_context_params.cc +74 -0
- data/ext/cld3/task_context_params.h +54 -0
- data/ext/cld3/task_spec.proto +98 -0
- data/ext/cld3/text_processing.cc +245 -0
- data/ext/cld3/text_processing.h +30 -0
- data/ext/cld3/unicodetext.cc +96 -0
- data/ext/cld3/unicodetext.h +144 -0
- data/ext/cld3/utf8acceptinterchange.h +486 -0
- data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
- data/ext/cld3/utf8repl_lettermarklower.h +758 -0
- data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
- data/ext/cld3/utf8statetable.cc +1344 -0
- data/ext/cld3/utf8statetable.h +285 -0
- data/ext/cld3/utils.cc +241 -0
- data/ext/cld3/utils.h +144 -0
- data/ext/cld3/workspace.cc +64 -0
- data/ext/cld3/workspace.h +177 -0
- data/lib/cld3.rb +99 -0
- metadata +158 -0
@@ -0,0 +1,142 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
// generated_ulscript.h
|
16
|
+
// Machine generated. Do Not Edit.
|
17
|
+
//
|
18
|
+
// Declarations for scripts recognized by CLD2
|
19
|
+
//
|
20
|
+
|
21
|
+
#ifndef SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
22
|
+
#define SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
23
|
+
|
24
|
+
namespace chrome_lang_id {
|
25
|
+
namespace CLD2 {
|
26
|
+
|
27
|
+
// Four-valued classification attached to a script.
// NOTE(review): presumably "recognition type" (no language / exactly one
// language / many languages / CJK) -- confirm against generated_ulscript.cc.
typedef enum {RTypeNone = 0, RTypeOne, RTypeMany, RTypeCJK} ULScriptRType;
|
28
|
+
|
29
|
+
// A (C string, int) pair: `s` is the string key, `i` the associated value.
typedef struct {const char* s; int i;} CharIntPair;
|
30
|
+
|
31
|
+
typedef enum {
|
32
|
+
ULScript_Common = 0, // Zyyy
|
33
|
+
ULScript_Latin = 1, // Latn
|
34
|
+
ULScript_Greek = 2, // Grek
|
35
|
+
ULScript_Cyrillic = 3, // Cyrl
|
36
|
+
ULScript_Armenian = 4, // Armn
|
37
|
+
ULScript_Hebrew = 5, // Hebr
|
38
|
+
ULScript_Arabic = 6, // Arab
|
39
|
+
ULScript_Syriac = 7, // Syrc
|
40
|
+
ULScript_Thaana = 8, // Thaa
|
41
|
+
ULScript_Devanagari = 9, // Deva
|
42
|
+
ULScript_Bengali = 10, // Beng
|
43
|
+
ULScript_Gurmukhi = 11, // Guru
|
44
|
+
ULScript_Gujarati = 12, // Gujr
|
45
|
+
ULScript_Oriya = 13, // Orya
|
46
|
+
ULScript_Tamil = 14, // Taml
|
47
|
+
ULScript_Telugu = 15, // Telu
|
48
|
+
ULScript_Kannada = 16, // Knda
|
49
|
+
ULScript_Malayalam = 17, // Mlym
|
50
|
+
ULScript_Sinhala = 18, // Sinh
|
51
|
+
ULScript_Thai = 19, // Thai
|
52
|
+
ULScript_Lao = 20, // Laoo
|
53
|
+
ULScript_Tibetan = 21, // Tibt
|
54
|
+
ULScript_Myanmar = 22, // Mymr
|
55
|
+
ULScript_Georgian = 23, // Geor
|
56
|
+
ULScript_Hani = 24, // Hani
|
57
|
+
ULScript_Ethiopic = 25, // Ethi
|
58
|
+
ULScript_Cherokee = 26, // Cher
|
59
|
+
ULScript_Canadian_Aboriginal = 27, // Cans
|
60
|
+
ULScript_Ogham = 28, // Ogam
|
61
|
+
ULScript_Runic = 29, // Runr
|
62
|
+
ULScript_Khmer = 30, // Khmr
|
63
|
+
ULScript_Mongolian = 31, // Mong
|
64
|
+
ULScript_32 = 32, //
|
65
|
+
ULScript_33 = 33, //
|
66
|
+
ULScript_Bopomofo = 34, // Bopo
|
67
|
+
ULScript_35 = 35, //
|
68
|
+
ULScript_Yi = 36, // Yiii
|
69
|
+
ULScript_Old_Italic = 37, // Ital
|
70
|
+
ULScript_Gothic = 38, // Goth
|
71
|
+
ULScript_Deseret = 39, // Dsrt
|
72
|
+
ULScript_Inherited = 40, // Zinh
|
73
|
+
ULScript_Tagalog = 41, // Tglg
|
74
|
+
ULScript_Hanunoo = 42, // Hano
|
75
|
+
ULScript_Buhid = 43, // Buhd
|
76
|
+
ULScript_Tagbanwa = 44, // Tagb
|
77
|
+
ULScript_Limbu = 45, // Limb
|
78
|
+
ULScript_Tai_Le = 46, // Tale
|
79
|
+
ULScript_Linear_B = 47, // Linb
|
80
|
+
ULScript_Ugaritic = 48, // Ugar
|
81
|
+
ULScript_Shavian = 49, // Shaw
|
82
|
+
ULScript_Osmanya = 50, // Osma
|
83
|
+
ULScript_Cypriot = 51, // Cprt
|
84
|
+
ULScript_Braille = 52, // Brai
|
85
|
+
ULScript_Buginese = 53, // Bugi
|
86
|
+
ULScript_Coptic = 54, // Copt
|
87
|
+
ULScript_New_Tai_Lue = 55, // Talu
|
88
|
+
ULScript_Glagolitic = 56, // Glag
|
89
|
+
ULScript_Tifinagh = 57, // Tfng
|
90
|
+
ULScript_Syloti_Nagri = 58, // Sylo
|
91
|
+
ULScript_Old_Persian = 59, // Xpeo
|
92
|
+
ULScript_Kharoshthi = 60, // Khar
|
93
|
+
ULScript_Balinese = 61, // Bali
|
94
|
+
ULScript_Cuneiform = 62, // Xsux
|
95
|
+
ULScript_Phoenician = 63, // Phnx
|
96
|
+
ULScript_Phags_Pa = 64, // Phag
|
97
|
+
ULScript_Nko = 65, // Nkoo
|
98
|
+
ULScript_Sundanese = 66, // Sund
|
99
|
+
ULScript_Lepcha = 67, // Lepc
|
100
|
+
ULScript_Ol_Chiki = 68, // Olck
|
101
|
+
ULScript_Vai = 69, // Vaii
|
102
|
+
ULScript_Saurashtra = 70, // Saur
|
103
|
+
ULScript_Kayah_Li = 71, // Kali
|
104
|
+
ULScript_Rejang = 72, // Rjng
|
105
|
+
ULScript_Lycian = 73, // Lyci
|
106
|
+
ULScript_Carian = 74, // Cari
|
107
|
+
ULScript_Lydian = 75, // Lydi
|
108
|
+
ULScript_Cham = 76, // Cham
|
109
|
+
ULScript_Tai_Tham = 77, // Lana
|
110
|
+
ULScript_Tai_Viet = 78, // Tavt
|
111
|
+
ULScript_Avestan = 79, // Avst
|
112
|
+
ULScript_Egyptian_Hieroglyphs = 80, // Egyp
|
113
|
+
ULScript_Samaritan = 81, // Samr
|
114
|
+
ULScript_Lisu = 82, // Lisu
|
115
|
+
ULScript_Bamum = 83, // Bamu
|
116
|
+
ULScript_Javanese = 84, // Java
|
117
|
+
ULScript_Meetei_Mayek = 85, // Mtei
|
118
|
+
ULScript_Imperial_Aramaic = 86, // Armi
|
119
|
+
ULScript_Old_South_Arabian = 87, // Sarb
|
120
|
+
ULScript_Inscriptional_Parthian = 88, // Prti
|
121
|
+
ULScript_Inscriptional_Pahlavi = 89, // Phli
|
122
|
+
ULScript_Old_Turkic = 90, // Orkh
|
123
|
+
ULScript_Kaithi = 91, // Kthi
|
124
|
+
ULScript_Batak = 92, // Batk
|
125
|
+
ULScript_Brahmi = 93, // Brah
|
126
|
+
ULScript_Mandaic = 94, // Mand
|
127
|
+
ULScript_Chakma = 95, // Cakm
|
128
|
+
ULScript_Meroitic_Cursive = 96, // Merc
|
129
|
+
ULScript_Meroitic_Hieroglyphs = 97, // Mero
|
130
|
+
ULScript_Miao = 98, // Plrd
|
131
|
+
ULScript_Sharada = 99, // Shrd
|
132
|
+
ULScript_Sora_Sompeng = 100, // Sora
|
133
|
+
ULScript_Takri = 101, // Takr
|
134
|
+
NUM_ULSCRIPTS
|
135
|
+
} ULScript;
|
136
|
+
|
137
|
+
#define UNKNOWN_ULSCRIPT ULScript_Common
|
138
|
+
|
139
|
+
} // namespace CLD2
|
140
|
+
} // namespace chrome_lang_id
|
141
|
+
|
142
|
+
#endif // SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
@@ -0,0 +1,1109 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// Author: dsites@google.com (Dick Sites)
|
17
|
+
//
|
18
|
+
|
19
|
+
|
20
|
+
#include "getonescriptspan.h"
|
21
|
+
|
22
|
+
#include <string.h>
|
23
|
+
|
24
|
+
#include "fixunicodevalue.h"
|
25
|
+
#include "port.h"
|
26
|
+
#include "utf8acceptinterchange.h"
|
27
|
+
#include "utf8repl_lettermarklower.h"
|
28
|
+
#include "utf8prop_lettermarkscriptnum.h"
|
29
|
+
#include "utf8scannot_lettermarkspecial.h"
|
30
|
+
#include "utf8statetable.h"
|
31
|
+
|
32
|
+
namespace chrome_lang_id {
|
33
|
+
namespace CLD2 {
|
34
|
+
|
35
|
+
// Alphabetical order for binary search, from
|
36
|
+
// generated_entities.cc
|
37
|
+
extern const int kNameToEntitySize;
|
38
|
+
extern const CharIntPair kNameToEntity[];
|
39
|
+
|
40
|
+
static const char kSpecialSymbol[256] = { // true for < > &
|
41
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
42
|
+
0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
|
43
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
44
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
45
|
+
|
46
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
47
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
48
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
49
|
+
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
50
|
+
};
|
51
|
+
|
52
|
+
|
53
|
+
|
54
|
+
#define LT 0 // <
|
55
|
+
#define GT 1 // >
|
56
|
+
#define EX 2 // !
|
57
|
+
#define HY 3 // -
|
58
|
+
#define QU 4 // "
|
59
|
+
#define AP 5 // '
|
60
|
+
#define SL 6 // /
|
61
|
+
#define S_ 7
|
62
|
+
#define C_ 8
|
63
|
+
#define R_ 9
|
64
|
+
#define I_ 10
|
65
|
+
#define P_ 11
|
66
|
+
#define T_ 12
|
67
|
+
#define Y_ 13
|
68
|
+
#define L_ 14
|
69
|
+
#define E_ 15
|
70
|
+
#define CR 16 // <cr> or <lf>
|
71
|
+
#define NL 17 // non-letter: ASCII whitespace, digit, punctuation
|
72
|
+
#define PL 18 // possible letter, incl. &
|
73
|
+
#define xx 19 // <unused>
|
74
|
+
|
75
|
+
// Map byte to one of ~20 interesting categories for cheap tag parsing
|
76
|
+
static const uint8 kCharToSub[256] = {
|
77
|
+
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
|
78
|
+
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
|
79
|
+
NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
|
80
|
+
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
|
81
|
+
|
82
|
+
PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
|
83
|
+
P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
|
84
|
+
PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
|
85
|
+
P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
|
86
|
+
|
87
|
+
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
|
88
|
+
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
|
89
|
+
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
|
90
|
+
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
|
91
|
+
|
92
|
+
PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
|
93
|
+
PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
|
94
|
+
PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
|
95
|
+
PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
|
96
|
+
};
|
97
|
+
|
98
|
+
#undef LT
|
99
|
+
#undef GT
|
100
|
+
#undef EX
|
101
|
+
#undef HY
|
102
|
+
#undef QU
|
103
|
+
#undef AP
|
104
|
+
#undef SL
|
105
|
+
#undef S_
|
106
|
+
#undef C_
|
107
|
+
#undef R_
|
108
|
+
#undef I_
|
109
|
+
#undef P_
|
110
|
+
#undef T_
|
111
|
+
#undef Y_
|
112
|
+
#undef L_
|
113
|
+
#undef E_
|
114
|
+
#undef CR
|
115
|
+
#undef NL
|
116
|
+
#undef PL
|
117
|
+
#undef xx
|
118
|
+
|
119
|
+
|
120
|
+
#define OK 0
|
121
|
+
#define X_ 1
|
122
|
+
|
123
|
+
|
124
|
+
static const int kMaxExitStateLettersMarksOnly = 1;
|
125
|
+
static const int kMaxExitStateAllText = 2;
|
126
|
+
|
127
|
+
|
128
|
+
// State machine to do cheap parse of non-letter strings incl. tags
|
129
|
+
// advances <tag>
|
130
|
+
// | |
|
131
|
+
// advances <tag> ... </tag> for <script> <style>
|
132
|
+
// | |
|
133
|
+
// advances <!-- ... <tag> ... -->
|
134
|
+
// | |
|
135
|
+
// advances <tag
|
136
|
+
// || (0)
|
137
|
+
// advances <tag <tag2>
|
138
|
+
// || (0)
|
139
|
+
//
|
140
|
+
// We start in state [0] at a non-letter and make at least one transition
|
141
|
+
// When scanning for just letters, arriving back at state [0] or [1] exits
|
142
|
+
// the state machine.
|
143
|
+
// When scanning for any non-tag text, arriving at state [2] also exits
|
144
|
+
static const uint8 kTagParseTbl_0[] = {
|
145
|
+
// < > ! - " ' / S C R I P T Y L E CR NL PL xx
|
146
|
+
3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK exit state
|
147
|
+
X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state
|
148
|
+
3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL* [exit state]
|
149
|
+
X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] <
|
150
|
+
X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <!
|
151
|
+
X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!-
|
152
|
+
6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.*
|
153
|
+
6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*-
|
154
|
+
6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*--
|
155
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.*
|
156
|
+
10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
|
157
|
+
11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
|
158
|
+
X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
|
159
|
+
|
160
|
+
// < > ! - " ' / S C R I P T Y L E CR NL PL xx
|
161
|
+
X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S
|
162
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC
|
163
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR
|
164
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI
|
165
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP
|
166
|
+
X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
|
167
|
+
20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
|
168
|
+
19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
|
169
|
+
19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF
|
170
|
+
19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
|
171
|
+
19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
|
172
|
+
19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
|
173
|
+
19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
|
174
|
+
19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
|
175
|
+
19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
|
176
|
+
|
177
|
+
// < > ! - " ' / S C R I P T Y L E CR NL PL xx
|
178
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST
|
179
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY
|
180
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL
|
181
|
+
X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
|
182
|
+
33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
|
183
|
+
32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
|
184
|
+
32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF
|
185
|
+
32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
|
186
|
+
32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
|
187
|
+
32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
|
188
|
+
32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
|
189
|
+
32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
|
190
|
+
};
|
191
|
+
|
192
|
+
#undef OK
|
193
|
+
#undef X_
|
194
|
+
|
195
|
+
// Limits and sentinel values for UTF-8 / code point ("rune") handling.
enum {
  UTFmax    = 4,         // a rune never needs more than this many bytes
  Runesync  = 0x80,      // bytes below this cannot be part of a UTF sequence
  Runeself  = 0x80,      // below this, a rune and its UTF-8 form are identical
  Runeerror = 0xFFFD,    // substituted when decoding/encoding fails
  Runemax   = 0x10FFFF,  // largest legal rune value
};
|
203
|
+
|
204
|
+
// Debugging. Not thread safe.
|
205
|
+
static char gDisplayPiece[32];
|
206
|
+
const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4};
|
207
|
+
char* DisplayPiece(const char* next_byte_, int byte_length_) {
|
208
|
+
// Copy up to 8 UTF-8 chars to buffer
|
209
|
+
int k = 0; // byte count
|
210
|
+
int n = 0; // character count
|
211
|
+
for (int i = 0; i < byte_length_; ++i) {
|
212
|
+
char c = next_byte_[i];
|
213
|
+
if ((c & 0xc0) != 0x80) {
|
214
|
+
// Beginning of a UTF-8 character
|
215
|
+
int charlen = gCharlen[static_cast<uint8>(c) >> 4];
|
216
|
+
if (i + charlen > byte_length_) {break;} // Not enough room for full char
|
217
|
+
if (k >= (32 - 7)) {break;} // Not necessarily enough room
|
218
|
+
if (n >= 8) {break;} // Enough characters already
|
219
|
+
++n;
|
220
|
+
}
|
221
|
+
if (c == '<') {
|
222
|
+
memcpy(&gDisplayPiece[k], "<", 4); k += 4;
|
223
|
+
} else if (c == '>') {
|
224
|
+
memcpy(&gDisplayPiece[k], ">", 4); k += 4;
|
225
|
+
} else if (c == '&') {
|
226
|
+
memcpy(&gDisplayPiece[k], "&", 5); k += 5;
|
227
|
+
} else if (c == '\'') {
|
228
|
+
memcpy(&gDisplayPiece[k], "'", 6); k += 6;
|
229
|
+
} else if (c == '"') {
|
230
|
+
memcpy(&gDisplayPiece[k], """, 6); k += 6;
|
231
|
+
} else {
|
232
|
+
gDisplayPiece[k++] = c;
|
233
|
+
}
|
234
|
+
}
|
235
|
+
gDisplayPiece[k++] = '\0';
|
236
|
+
return gDisplayPiece;
|
237
|
+
}
|
238
|
+
|
239
|
+
|
240
|
+
|
241
|
+
// runetochar copies (encodes) one rune, pointed to by r, to at most
|
242
|
+
// UTFmax bytes starting at s and returns the number of bytes generated.
|
243
|
+
int runetochar(char *str, const char32 *rune) {
|
244
|
+
// Convert to unsigned for range check.
|
245
|
+
unsigned long c;
|
246
|
+
|
247
|
+
// 1 char 00-7F
|
248
|
+
c = *rune;
|
249
|
+
if(c <= 0x7F) {
|
250
|
+
str[0] = static_cast<char>(c);
|
251
|
+
return 1;
|
252
|
+
}
|
253
|
+
|
254
|
+
// 2 char 0080-07FF
|
255
|
+
if(c <= 0x07FF) {
|
256
|
+
str[0] = 0xC0 | static_cast<char>(c >> 1*6);
|
257
|
+
str[1] = 0x80 | (c & 0x3F);
|
258
|
+
return 2;
|
259
|
+
}
|
260
|
+
|
261
|
+
// Range check
|
262
|
+
if (c > Runemax) {
|
263
|
+
c = Runeerror;
|
264
|
+
}
|
265
|
+
|
266
|
+
// 3 char 0800-FFFF
|
267
|
+
if (c <= 0xFFFF) {
|
268
|
+
str[0] = 0xE0 | static_cast<char>(c >> 2*6);
|
269
|
+
str[1] = 0x80 | ((c >> 1*6) & 0x3F);
|
270
|
+
str[2] = 0x80 | (c & 0x3F);
|
271
|
+
return 3;
|
272
|
+
}
|
273
|
+
|
274
|
+
// 4 char 10000-1FFFFF
|
275
|
+
str[0] = 0xF0 | static_cast<char>(c >> 3*6);
|
276
|
+
str[1] = 0x80 | ((c >> 2*6) & 0x3F);
|
277
|
+
str[2] = 0x80 | ((c >> 1*6) & 0x3F);
|
278
|
+
str[3] = 0x80 | (c & 0x3F);
|
279
|
+
return 4;
|
280
|
+
}
|
281
|
+
|
282
|
+
|
283
|
+
|
284
|
+
// Useful for converting an entity to an ascii value.
|
285
|
+
// RETURNS unicode value, or -1 if entity isn't valid. Don't include & or ;
|
286
|
+
int LookupEntity(const char* entity_name, int entity_len) {
|
287
|
+
// Make a C string
|
288
|
+
if (entity_len >= 16) {return -1;} // All real entities are shorter
|
289
|
+
char temp[16];
|
290
|
+
memcpy(temp, entity_name, entity_len);
|
291
|
+
temp[entity_len] = '\0';
|
292
|
+
int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity);
|
293
|
+
if (match >= 0) {return kNameToEntity[match].i;}
|
294
|
+
return -1;
|
295
|
+
}
|
296
|
+
|
297
|
+
// True only for the ASCII decimal digits '0'..'9' (locale independent).
bool ascii_isdigit(char c) {
  return c >= '0' && c <= '9';
}
|
300
|
+
// True only for ASCII hexadecimal digits: 0-9, a-f, A-F.
bool ascii_isxdigit(char c) {
  const bool is_decimal = (c >= '0') && (c <= '9');
  const bool is_lower_hex = (c >= 'a') && (c <= 'f');
  const bool is_upper_hex = (c >= 'A') && (c <= 'F');
  return is_decimal || is_lower_hex || is_upper_hex;
}
|
306
|
+
// True only for ASCII letters and decimal digits: 0-9, a-z, A-Z.
bool ascii_isalnum(char c) {
  const bool is_decimal = (c >= '0') && (c <= '9');
  const bool is_lower = (c >= 'a') && (c <= 'z');
  const bool is_upper = (c >= 'A') && (c <= 'Z');
  return is_decimal || is_lower || is_upper;
}
|
312
|
+
// Converts one ASCII hex digit to its value 0..15.
// Any character that is not a hex digit yields 0.
int hex_digit_to_int(char c) {
  int value = 0;
  if (c >= '0' && c <= '9') {
    value = c - '0';
  } else if (c >= 'a' && c <= 'f') {
    value = 10 + (c - 'a');
  } else if (c >= 'A' && c <= 'F') {
    value = 10 + (c - 'A');
  }
  return value;
}
|
318
|
+
|
319
|
+
// Parses the run of decimal digits at the start of [nptr, limit).
// Leading '0' characters are skipped; at least one digit must follow them.
// On success, SETS *endptr just past the last digit and RETURNS the value
// after FixUnicodeValue(), or 0xFFFD if the number does not fit in an int32.
// On failure, leaves *endptr at the original nptr and RETURNS -1.
static int32 strto32_base10(const char* nptr, const char* limit,
                            const char **endptr) {
  *endptr = nptr;
  while (nptr < limit && *nptr == '0') {
    ++nptr;
  }
  if (nptr == limit || !ascii_isdigit(*nptr))
    return -1;
  const char* end_digits_run = nptr;
  while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) {
    ++end_digits_run;
  }
  *endptr = end_digits_run;
  const int num_digits = static_cast<int>(end_digits_run - nptr);
  // kint32max == 2147483647, so every number of up to 9 digits fits, and a
  // 10-digit number fits only if it compares <= "2147483647". (The previous
  // test used "< 9", which wrongly classified 9-digit numbers as overflow.)
  if (num_digits <= 9 ||
      (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) {
    int value = 0;
    for (; nptr < end_digits_run; ++nptr) {
      value *= 10;
      value += *nptr - '0';
    }
    // Overflow past the last valid unicode codepoint
    // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
    return FixUnicodeValue(value);
  } else {
    // Overflow: can't fit in an int32;
    // returns the replacement character 0xFFFD.
    return 0xFFFD;
  }
}
|
350
|
+
|
351
|
+
// Hexadecimal counterpart of the decimal parser: reads the run of hex digits
// at the start of [nptr, limit), ignoring leading '0' characters.
// On success, SETS *endptr just past the last hex digit and RETURNS the value
// after FixUnicodeValue(), or 0xFFFD if it does not fit in an int32.
// On failure, leaves *endptr at the original nptr and RETURNS -1.
static int32 strto32_base16(const char* nptr, const char* limit,
                            const char **endptr) {
  *endptr = nptr;

  // Step over leading zeros; a significant hex digit must follow.
  const char* first = nptr;
  while (first < limit && *first == '0') {
    ++first;
  }
  if (first == limit || !ascii_isxdigit(*first)) {
    return -1;
  }

  // Find the end of the hex digit run.
  const char* last = first;
  while (last < limit && ascii_isxdigit(*last)) {
    ++last;
  }
  *endptr = last;

  // kint32max == 0x7FFFFFFF: up to 7 digits always fit, 8 digits fit only
  // when the leading digit is below '8'.
  const int num_xdigits = last - first;
  const bool fits_in_int32 =
      (num_xdigits < 8) || (num_xdigits == 8 && first[0] < '8');
  if (!fits_in_int32) {
    // Too large for an int32: report the replacement character.
    return 0xFFFD;
  }

  int value = 0;
  for (const char* p = first; p < last; ++p) {
    value = (value << 4) + hex_digit_to_int(*p);
  }
  // Values beyond the last valid codepoint (0x10ffff) are turned into U+FFFD
  // by FixUnicodeValue().
  return FixUnicodeValue(value);
}
|
382
|
+
|
383
|
+
// Unescape the current character pointed to by src. SETS the number
|
384
|
+
// of chars read for the conversion (in UTF8). If src isn't a valid entity,
|
385
|
+
// just consume the & and RETURN -1. If src doesn't point to & -- which it
|
386
|
+
// should -- set src_consumed to 0 and RETURN -1.
|
387
|
+
int ReadEntity(const char* src, int srcn, int* src_consumed) {
|
388
|
+
const char* const srcend = src + srcn;
|
389
|
+
|
390
|
+
if (srcn == 0 || *src != '&') { // input should start with an ampersand
|
391
|
+
*src_consumed = 0;
|
392
|
+
return -1;
|
393
|
+
}
|
394
|
+
*src_consumed = 1; // we'll get the & at least
|
395
|
+
|
396
|
+
// The standards are a bit unclear on when an entity ends. Certainly a ";"
|
397
|
+
// ends one, but spaces probably do too. We follow the lead of both IE and
|
398
|
+
// Netscape, which as far as we can tell end numeric entities (1st case below)
|
399
|
+
// at any non-digit, and end character entities (2nd case) at any non-alnum.
|
400
|
+
const char* entstart, *entend; // where the entity starts and ends
|
401
|
+
entstart = src + 1; // read past the &
|
402
|
+
int entval; // UCS2 value of the entity
|
403
|
+
if ( *entstart == '#' ) { // -- 1st case: numeric entity
|
404
|
+
if ( entstart + 2 >= srcend ) {
|
405
|
+
return -1; // no way a legitimate number could fit
|
406
|
+
} else if ( entstart[1] == 'x' || entstart[1] == 'X' ) { // hex numeric
|
407
|
+
entval = strto32_base16(entstart + 2, srcend, &entend);
|
408
|
+
} else { // decimal numeric entity
|
409
|
+
entval = strto32_base10(entstart+1, srcend, &entend);
|
410
|
+
}
|
411
|
+
if (entval == -1 || entend > srcend) {
|
412
|
+
return -1; // not entirely correct, but close enough
|
413
|
+
}
|
414
|
+
} else { // -- 2nd case: character entity
|
415
|
+
for (entend = entstart;
|
416
|
+
entend < srcend && ascii_isalnum(*entend);
|
417
|
+
++entend ) {
|
418
|
+
// entity consists of alphanumeric chars
|
419
|
+
}
|
420
|
+
entval = LookupEntity(entstart, entend - entstart);
|
421
|
+
if (entval < 0) {
|
422
|
+
return -1; // not a legal entity name
|
423
|
+
}
|
424
|
+
// Now we do a strange-seeming IE6-compatibility check: if entval is
|
425
|
+
// >= 256, it *must* be followed by a semicolon or it's not considered
|
426
|
+
// an entity. The problem is lots of the newfangled entity names, like
|
427
|
+
// "lang", also occur in URL CGI arguments: "/search?q=test&lang=en".
|
428
|
+
// When these links are written in HTML, it would be really bad if the
|
429
|
+
// "&lang" were treated as an entity, which is what the spec says
|
430
|
+
// *should* happen (even when the HTML is inside an "A HREF" tag!)
|
431
|
+
// IE ignores the spec for these new, high-value entities, so we do too.
|
432
|
+
if ( entval >= 256 && !(entend < srcend && *entend == ';') ) {
|
433
|
+
return -1; // make non-;-terminated entity illegal
|
434
|
+
}
|
435
|
+
}
|
436
|
+
|
437
|
+
// Finally, figure out how much src was consumed
|
438
|
+
if ( entend < srcend && *entend == ';' ) {
|
439
|
+
entend++; // standard says ; terminator is special
|
440
|
+
}
|
441
|
+
*src_consumed = entend - src;
|
442
|
+
return entval;
|
443
|
+
}
|
444
|
+
|
445
|
+
|
446
|
+
// Src points to '&'
|
447
|
+
// Writes entity value to dst. Returns take(src), put(dst) byte counts
|
448
|
+
void EntityToBuffer(const char* src, int len, char* dst,
|
449
|
+
int* tlen, int* plen) {
|
450
|
+
char32 entval = ReadEntity(src, len, tlen);
|
451
|
+
|
452
|
+
// ReadEntity does this already: entval = FixUnicodeValue(entval);
|
453
|
+
|
454
|
+
// Convert UTF-32 to UTF-8
|
455
|
+
if (entval > 0) {
|
456
|
+
*plen = runetochar(dst, &entval);
|
457
|
+
} else {
|
458
|
+
// Illegal entity; ignore the '&'
|
459
|
+
*tlen = 1;
|
460
|
+
*plen = 0;
|
461
|
+
}
|
462
|
+
}
|
463
|
+
|
464
|
+
// Returns true if character is < > or &, none of which are letters
|
465
|
+
bool inline IsSpecial(char c) {
|
466
|
+
// Comparison (int != 0) is used to silence the warning:
|
467
|
+
// 'const char': forcing value to bool
|
468
|
+
if ((c & 0xe0) == 0x20) {
|
469
|
+
return (kSpecialSymbol[static_cast<uint8>(c)] != 0);
|
470
|
+
}
|
471
|
+
return false;
|
472
|
+
}
|
473
|
+
|
474
|
+
// Quick Skip to next letter or < > & or to end of string (eos)
|
475
|
+
// Always return is_letter for eos
|
476
|
+
int ScanToLetterOrSpecial(const char* src, int len) {
|
477
|
+
int bytes_consumed;
|
478
|
+
StringPiece str(src, len);
|
479
|
+
UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed);
|
480
|
+
return bytes_consumed;
|
481
|
+
}
|
482
|
+
|
483
|
+
|
484
|
+
|
485
|
+
|
486
|
+
// src points to non-letter, such as tag-opening '<'
|
487
|
+
// Return length from here to next possible letter
|
488
|
+
// On another < before >, return 1
|
489
|
+
// advances <tag>
|
490
|
+
// | |
|
491
|
+
// advances <tag> ... </tag> for <script> <style>
|
492
|
+
// | |
|
493
|
+
// advances <!-- ... <tag> ... -->
|
494
|
+
// | |
|
495
|
+
// advances <tag
|
496
|
+
// | | end of string
|
497
|
+
// advances <tag <tag2>
|
498
|
+
// ||
|
499
|
+
int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) {
|
500
|
+
const uint8* src = reinterpret_cast<const uint8*>(isrc);
|
501
|
+
const uint8* srclimit = src + len;
|
502
|
+
const uint8* tagParseTbl = kTagParseTbl_0;
|
503
|
+
int e = 0;
|
504
|
+
while (src < srclimit) {
|
505
|
+
e = tagParseTbl[kCharToSub[*src++]];
|
506
|
+
if (e <= max_exit_state) {
|
507
|
+
// We overshot by one byte
|
508
|
+
--src;
|
509
|
+
break;
|
510
|
+
}
|
511
|
+
tagParseTbl = &kTagParseTbl_0[e * 20];
|
512
|
+
}
|
513
|
+
|
514
|
+
if (src >= srclimit) {
|
515
|
+
// We fell off the end of the text.
|
516
|
+
// It looks like the most common case for this is a truncated file, not
|
517
|
+
// mismatched angle brackets. So we pretend that the last char was '>'
|
518
|
+
return len;
|
519
|
+
}
|
520
|
+
|
521
|
+
// OK to be in state 0 or state 2 at exit
|
522
|
+
if ((e != 0) && (e != 2)) {
|
523
|
+
// Error, '<' followed by '<'
|
524
|
+
// We want to back up to first <, then advance by one byte past it
|
525
|
+
int offset = src - reinterpret_cast<const uint8*>(isrc);
|
526
|
+
|
527
|
+
// Backscan to first '<' and return enough length to just get past it
|
528
|
+
--offset; // back up over the second '<', which caused us to stop
|
529
|
+
while ((0 < offset) && (isrc[offset] != '<')) {
|
530
|
+
// Find the first '<', which is unmatched
|
531
|
+
--offset;
|
532
|
+
}
|
533
|
+
// skip to just beyond first '<'
|
534
|
+
return offset + 1;
|
535
|
+
}
|
536
|
+
|
537
|
+
return src - reinterpret_cast<const uint8*>(isrc);
|
538
|
+
}
|
539
|
+
|
540
|
+
// Returns mid if key found in lo <= mid < hi, else -1
|
541
|
+
int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) {
|
542
|
+
// binary search
|
543
|
+
while (lo < hi) {
|
544
|
+
int mid = (lo + hi) >> 1;
|
545
|
+
if (strcmp(key, cipair[mid].s) < 0) {
|
546
|
+
hi = mid;
|
547
|
+
} else if (strcmp(key, cipair[mid].s) > 0) {
|
548
|
+
lo = mid + 1;
|
549
|
+
} else {
|
550
|
+
return mid;
|
551
|
+
}
|
552
|
+
}
|
553
|
+
return -1;
|
554
|
+
}
|
555
|
+
|
556
|
+
// Returns the length in bytes of the prefix of src that is all
|
557
|
+
// interchange valid UTF-8
|
558
|
+
int SpanInterchangeValid(const char* src, int byte_length) {
|
559
|
+
int bytes_consumed;
|
560
|
+
const UTF8ReplaceObj* st = &utf8acceptinterchange_obj;
|
561
|
+
StringPiece str(src, byte_length);
|
562
|
+
UTF8GenericScan(st, str, &bytes_consumed);
|
563
|
+
return bytes_consumed;
|
564
|
+
}
|
565
|
+
|
566
|
+
// Basic scanner over buffer[0..buffer_length): spans contain letters and
// marks only, and each span is limited to one script.
// The two work buffers allocated here are released by the destructor.
ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text)
    : start_byte_(buffer),
      next_byte_(buffer),
      byte_length_(buffer_length),
      is_plain_text_(is_plain_text),
      letters_marks_only_(true),
      one_script_only_(true),
      exit_state_(kMaxExitStateLettersMarksOnly) {
  // Work buffers: the extracted span and its lowercased copy
  script_buffer_ = new char[kMaxScriptBuffer];
  script_buffer_lower_ = new char[kMaxScriptLowerBuffer];

  // Both offset maps start out empty
  map2uplow_.Clear();      // script_buffer_lower_ -> script_buffer_
  map2original_.Clear();   // script_buffer_ -> buffer
}
|
581
|
+
|
582
|
+
// Extended version to allow spans of any non-tag text and spans of mixed script
|
583
|
+
ScriptScanner::ScriptScanner(const char* buffer,
|
584
|
+
int buffer_length,
|
585
|
+
bool is_plain_text,
|
586
|
+
bool any_text,
|
587
|
+
bool any_script)
|
588
|
+
: start_byte_(buffer),
|
589
|
+
next_byte_(buffer),
|
590
|
+
byte_length_(buffer_length),
|
591
|
+
is_plain_text_(is_plain_text),
|
592
|
+
letters_marks_only_(!any_text),
|
593
|
+
one_script_only_(!any_script),
|
594
|
+
exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) {
|
595
|
+
script_buffer_ = new char[kMaxScriptBuffer];
|
596
|
+
script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
|
597
|
+
map2original_.Clear(); // map from script_buffer_ to buffer
|
598
|
+
map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_
|
599
|
+
}
|
600
|
+
|
601
|
+
|
602
|
+
// Releases the two work buffers allocated by the constructors.
ScriptScanner::~ScriptScanner() {
  delete[] script_buffer_lower_;
  delete[] script_buffer_;
}
|
606
|
+
|
607
|
+
|
608
|
+
|
609
|
+
|
610
|
+
// Get to the first real non-tag letter or entity that is a letter
|
611
|
+
// Sets script of that letter
|
612
|
+
// Return len if no more letters
|
613
|
+
int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
|
614
|
+
int sc = UNKNOWN_ULSCRIPT;
|
615
|
+
int skip = 0;
|
616
|
+
int tlen, plen;
|
617
|
+
|
618
|
+
// Do run of non-letters (tag | &NL | NL)*
|
619
|
+
tlen = 0;
|
620
|
+
while (skip < len) {
|
621
|
+
// Do fast scan to next interesting byte
|
622
|
+
// int oldskip = skip;
|
623
|
+
skip += ScanToLetterOrSpecial(src + skip, len - skip);
|
624
|
+
|
625
|
+
// Check for no more letters/specials
|
626
|
+
if (skip >= len) {
|
627
|
+
// All done
|
628
|
+
*script = sc;
|
629
|
+
return len;
|
630
|
+
}
|
631
|
+
|
632
|
+
// We are at a letter, nonletter, tag, or entity
|
633
|
+
if (IsSpecial(src[skip]) && !is_plain_text_) {
|
634
|
+
if (src[skip] == '<') {
|
635
|
+
// Begining of tag; skip to end and go around again
|
636
|
+
tlen = ScanToPossibleLetter(src + skip, len - skip,
|
637
|
+
exit_state_);
|
638
|
+
sc = 0;
|
639
|
+
} else if (src[skip] == '>') {
|
640
|
+
// Unexpected end of tag; skip it and go around again
|
641
|
+
tlen = 1; // Over the >
|
642
|
+
sc = 0;
|
643
|
+
} else if (src[skip] == '&') {
|
644
|
+
// Expand entity, no advance
|
645
|
+
char temp[4];
|
646
|
+
EntityToBuffer(src + skip, len - skip,
|
647
|
+
temp, &tlen, &plen);
|
648
|
+
if (plen > 0) {
|
649
|
+
sc = GetUTF8LetterScriptNum(temp);
|
650
|
+
}
|
651
|
+
}
|
652
|
+
} else {
|
653
|
+
// Update 1..4 bytes
|
654
|
+
tlen = UTF8OneCharLen(src + skip);
|
655
|
+
sc = GetUTF8LetterScriptNum(src + skip);
|
656
|
+
}
|
657
|
+
if (sc != 0) {break;} // Letter found
|
658
|
+
skip += tlen; // Else advance
|
659
|
+
}
|
660
|
+
|
661
|
+
*script = sc;
|
662
|
+
return skip;
|
663
|
+
}
|
664
|
+
|
665
|
+
|
666
|
+
// Helper for ASCII-only tag names.
// Forces bit 0x20 on in uplow, which maps an ASCII uppercase letter to its
// lowercase form, and compares the result to c. Pass c in lowercase.
inline bool EqCase(char uplow, char c) {
  const int folded = uplow | 0x20;
  return folded == c;
}
|
671
|
+
|
672
|
+
// Helper for ASCII-only tag names.
// True for space, '<', '>' and every other character value below 0x40; used
// to check that a tag name has ended.
// Note: where plain char is signed, bytes >= 0x80 are negative and therefore
// also compare below 0x40.
inline bool NeqLetter(char c) {
  return !(c >= 0x40);
}
|
677
|
+
|
678
|
+
// Helper for ASCII-only tag names.
// True only for space and '\n'; every other character, '\r' included, is
// not counted as whitespace here.
inline bool WS(char c) {
  switch (c) {
    case ' ':
    case '\n':
      return true;
    default:
      return false;
  }
}
|
683
|
+
|
684
|
+
// The one line-break character written to the output: both CR and LF in the
// input are canonicalized to this value.
static const char LF = '\n';
|
686
|
+
|
687
|
+
|
688
|
+
// The naive loop scans from next_byte_ to script_buffer_ until full.
|
689
|
+
// But this can leave an awkward hard-to-identify short fragment at the
|
690
|
+
// end of the input. We would prefer to make the next-to-last fragment
|
691
|
+
// shorter and the last fragment longer.
|
692
|
+
|
693
|
+
// Copy next run of non-tag characters to buffer [NUL terminated]
|
694
|
+
// This just replaces tags with space or \n and removes entities.
|
695
|
+
// Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences
|
696
|
+
// including \r or \n are replaced by \n. All other tags and skipped text
|
697
|
+
// are replaced with ASCII space.
|
698
|
+
//
|
699
|
+
// Buffer ALWAYS has leading space and trailing space space space NUL
|
700
|
+
bool ScriptScanner::GetOneTextSpan(LangSpan* span) {
|
701
|
+
span->text = script_buffer_;
|
702
|
+
span->text_bytes = 0;
|
703
|
+
span->offset = next_byte_ - start_byte_;
|
704
|
+
span->ulscript = UNKNOWN_ULSCRIPT;
|
705
|
+
span->truncated = false;
|
706
|
+
|
707
|
+
int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
|
708
|
+
if ((kMaxScriptBytes <= byte_length_) &&
|
709
|
+
(byte_length_ < (2 * kMaxScriptBytes))) {
|
710
|
+
// Try to split the last two fragments in half
|
711
|
+
put_soft_limit = byte_length_ / 2;
|
712
|
+
}
|
713
|
+
|
714
|
+
script_buffer_[0] = ' '; // Always a space at front of output
|
715
|
+
script_buffer_[1] = '\0';
|
716
|
+
int take = 0;
|
717
|
+
int put = 1; // Start after the initial space
|
718
|
+
int tlen = 0, plen = 0;
|
719
|
+
|
720
|
+
if (byte_length_ <= 0) {
|
721
|
+
return false; // No more text to be found
|
722
|
+
}
|
723
|
+
|
724
|
+
// Go over alternating spans of text and tags,
|
725
|
+
// copying letters to buffer with single spaces for each run of non-letters
|
726
|
+
bool last_byte_was_space = false;
|
727
|
+
while (take < byte_length_) {
|
728
|
+
char c = next_byte_[take];
|
729
|
+
if (c == '\r') {c = LF;} // Canonical CR or LF
|
730
|
+
if (c == '\n') {c = LF;} // Canonical CR or LF
|
731
|
+
|
732
|
+
if (IsSpecial(c) && !is_plain_text_) {
|
733
|
+
if (c == '<') {
|
734
|
+
// Replace tag with space
|
735
|
+
c = ' '; // for almost-full test below
|
736
|
+
// or if <p> <br> <tr>, replace with \n
|
737
|
+
if (take < (byte_length_ - 3)) {
|
738
|
+
if (EqCase(next_byte_[take + 1], 'p') &&
|
739
|
+
NeqLetter(next_byte_[take + 2])) {
|
740
|
+
c = LF;
|
741
|
+
}
|
742
|
+
if (EqCase(next_byte_[take + 1], 'b') &&
|
743
|
+
EqCase(next_byte_[take + 2], 'r') &&
|
744
|
+
NeqLetter(next_byte_[take + 3])) {
|
745
|
+
c = LF;
|
746
|
+
}
|
747
|
+
if (EqCase(next_byte_[take + 1], 't') &&
|
748
|
+
EqCase(next_byte_[take + 2], 'r') &&
|
749
|
+
NeqLetter(next_byte_[take + 3])) {
|
750
|
+
c = LF;
|
751
|
+
}
|
752
|
+
}
|
753
|
+
// Begining of tag; skip to end and go around again
|
754
|
+
tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
|
755
|
+
exit_state_);
|
756
|
+
// Copy one byte, compressing spaces
|
757
|
+
if (!last_byte_was_space || !WS(c)) {
|
758
|
+
script_buffer_[put++] = c; // Advance dest
|
759
|
+
last_byte_was_space = WS(c);
|
760
|
+
}
|
761
|
+
} else if (c == '>') {
|
762
|
+
// Unexpected end of tag; copy it and go around again
|
763
|
+
tlen = 1; // Over the >
|
764
|
+
script_buffer_[put++] = c; // Advance dest
|
765
|
+
} else if (c == '&') {
|
766
|
+
// Expand entity, no advance
|
767
|
+
EntityToBuffer(next_byte_ + take, byte_length_ - take,
|
768
|
+
script_buffer_ + put, &tlen, &plen);
|
769
|
+
put += plen; // Advance dest
|
770
|
+
}
|
771
|
+
take += tlen; // Advance source
|
772
|
+
} else {
|
773
|
+
// Copy one byte, compressing spaces
|
774
|
+
if (!last_byte_was_space || !WS(c)) {
|
775
|
+
script_buffer_[put++] = c; // Advance dest
|
776
|
+
last_byte_was_space = WS(c);
|
777
|
+
}
|
778
|
+
++take; // Advance source
|
779
|
+
}
|
780
|
+
|
781
|
+
if (WS(c) &&
|
782
|
+
(put >= put_soft_limit)) {
|
783
|
+
// Buffer is almost full
|
784
|
+
span->truncated = true;
|
785
|
+
break;
|
786
|
+
}
|
787
|
+
if (put >= kMaxScriptBytes) {
|
788
|
+
// Buffer is completely full
|
789
|
+
span->truncated = true;
|
790
|
+
break;
|
791
|
+
}
|
792
|
+
}
|
793
|
+
|
794
|
+
// Almost done. Back up to a character boundary if needed
|
795
|
+
while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) {
|
796
|
+
// Back up over continuation byte
|
797
|
+
--take;
|
798
|
+
--put;
|
799
|
+
}
|
800
|
+
|
801
|
+
// Update input position
|
802
|
+
next_byte_ += take;
|
803
|
+
byte_length_ -= take;
|
804
|
+
|
805
|
+
// Put four more spaces/NUL. Worst case is abcd _ _ _ \0
|
806
|
+
// kMaxScriptBytes | | put
|
807
|
+
script_buffer_[put + 0] = ' ';
|
808
|
+
script_buffer_[put + 1] = ' ';
|
809
|
+
script_buffer_[put + 2] = ' ';
|
810
|
+
script_buffer_[put + 3] = '\0';
|
811
|
+
|
812
|
+
span->text_bytes = put; // Does not include the last four chars above
|
813
|
+
return true;
|
814
|
+
}
|
815
|
+
|
816
|
+
|
817
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
818
|
+
// Buffer ALWAYS has leading space and trailing space space space NUL
|
819
|
+
bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
|
820
|
+
if (!letters_marks_only_) {
|
821
|
+
// Return non-tag text, including punctuation and digits
|
822
|
+
return GetOneTextSpan(span);
|
823
|
+
}
|
824
|
+
|
825
|
+
span->text = script_buffer_;
|
826
|
+
span->text_bytes = 0;
|
827
|
+
span->offset = next_byte_ - start_byte_;
|
828
|
+
span->ulscript = UNKNOWN_ULSCRIPT;
|
829
|
+
span->truncated = false;
|
830
|
+
|
831
|
+
// struct timeval script_start, script_mid, script_end;
|
832
|
+
|
833
|
+
int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
|
834
|
+
if ((kMaxScriptBytes <= byte_length_) &&
|
835
|
+
(byte_length_ < (2 * kMaxScriptBytes))) {
|
836
|
+
// Try to split the last two fragments in half
|
837
|
+
put_soft_limit = byte_length_ / 2;
|
838
|
+
}
|
839
|
+
|
840
|
+
|
841
|
+
int spanscript; // The script of this span
|
842
|
+
int sc = UNKNOWN_ULSCRIPT; // The script of next character
|
843
|
+
int tlen = 0;
|
844
|
+
int plen = 0;
|
845
|
+
|
846
|
+
script_buffer_[0] = ' '; // Always a space at front of output
|
847
|
+
script_buffer_[1] = '\0';
|
848
|
+
int take = 0;
|
849
|
+
int put = 1; // Start after the initial space
|
850
|
+
|
851
|
+
// Build offsets from span->text back to start_byte_ + span->offset
|
852
|
+
// This mapping reflects deletion of non-letters, expansion of
|
853
|
+
// entities, etc.
|
854
|
+
map2original_.Clear();
|
855
|
+
map2original_.Delete(span->offset); // So that MapBack(0) gives offset
|
856
|
+
|
857
|
+
// Get to the first real non-tag letter or entity that is a letter
|
858
|
+
int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
|
859
|
+
next_byte_ += skip;
|
860
|
+
byte_length_ -= skip;
|
861
|
+
|
862
|
+
if (skip != 1) {
|
863
|
+
map2original_.Delete(skip);
|
864
|
+
map2original_.Insert(1);
|
865
|
+
} else {
|
866
|
+
map2original_.Copy(1);
|
867
|
+
}
|
868
|
+
if (byte_length_ <= 0) {
|
869
|
+
map2original_.Reset();
|
870
|
+
return false; // No more letters to be found
|
871
|
+
}
|
872
|
+
|
873
|
+
// There is at least one letter, so we know the script for this span
|
874
|
+
span->ulscript = (ULScript)spanscript;
|
875
|
+
|
876
|
+
|
877
|
+
// Go over alternating spans of same-script letters and non-letters,
|
878
|
+
// copying letters to buffer with single spaces for each run of non-letters
|
879
|
+
while (take < byte_length_) {
|
880
|
+
// Copy run of letters in same script (&LS | LS)*
|
881
|
+
int letter_count = 0; // Keep track of word length
|
882
|
+
bool need_break = false;
|
883
|
+
|
884
|
+
while (take < byte_length_) {
|
885
|
+
// We are at a letter, nonletter, tag, or entity
|
886
|
+
if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
|
887
|
+
if (next_byte_[take] == '<') {
|
888
|
+
// Begining of tag
|
889
|
+
sc = 0;
|
890
|
+
break;
|
891
|
+
} else if (next_byte_[take] == '>') {
|
892
|
+
// Unexpected end of tag
|
893
|
+
sc = 0;
|
894
|
+
break;
|
895
|
+
} else if (next_byte_[take] == '&') {
|
896
|
+
// Copy entity, no advance
|
897
|
+
EntityToBuffer(next_byte_ + take, byte_length_ - take,
|
898
|
+
script_buffer_ + put, &tlen, &plen);
|
899
|
+
if (plen > 0) {
|
900
|
+
sc = GetUTF8LetterScriptNum(script_buffer_ + put);
|
901
|
+
}
|
902
|
+
}
|
903
|
+
} else {
|
904
|
+
// Real letter, safely copy up to 4 bytes, increment by 1..4
|
905
|
+
// Will update by 1..4 bytes at Advance, below
|
906
|
+
tlen = plen = UTF8OneCharLen(next_byte_ + take);
|
907
|
+
if (take < (byte_length_ - 3)) {
|
908
|
+
// X86 fast case, does unaligned load/store
|
909
|
+
UNALIGNED_STORE32(script_buffer_ + put,
|
910
|
+
UNALIGNED_LOAD32(next_byte_ + take));
|
911
|
+
|
912
|
+
} else {
|
913
|
+
// Slow case, happens 1-3 times per input document
|
914
|
+
memcpy(script_buffer_ + put, next_byte_ + take, plen);
|
915
|
+
}
|
916
|
+
sc = GetUTF8LetterScriptNum(next_byte_ + take);
|
917
|
+
}
|
918
|
+
|
919
|
+
// Allow continue across a single letter in a different script:
|
920
|
+
// A B D = three scripts, c = common script, i = inherited script,
|
921
|
+
// - = don't care, ( = take position before the += below
|
922
|
+
// AAA(A- continue
|
923
|
+
//
|
924
|
+
// AAA(BA continue
|
925
|
+
// AAA(BB break
|
926
|
+
// AAA(Bc continue (breaks after B)
|
927
|
+
// AAA(BD break
|
928
|
+
// AAA(Bi break
|
929
|
+
//
|
930
|
+
// AAA(c- break
|
931
|
+
//
|
932
|
+
// AAA(i- continue
|
933
|
+
//
|
934
|
+
|
935
|
+
if ((sc != spanscript) && (sc != ULScript_Inherited)) {
|
936
|
+
// Might need to break this script span
|
937
|
+
if (sc == ULScript_Common) {
|
938
|
+
need_break = true;
|
939
|
+
} else {
|
940
|
+
// Look at next following character, ignoring entity as Common
|
941
|
+
int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen);
|
942
|
+
if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
|
943
|
+
// We found a non-trivial change of script
|
944
|
+
if (one_script_only_) {
|
945
|
+
need_break = true;
|
946
|
+
}
|
947
|
+
}
|
948
|
+
}
|
949
|
+
}
|
950
|
+
if (need_break) {break;} // Non-letter or letter in wrong script
|
951
|
+
|
952
|
+
take += tlen; // Advance
|
953
|
+
put += plen; // Advance
|
954
|
+
|
955
|
+
// Update the offset map to reflect take/put lengths
|
956
|
+
if (tlen == plen) {
|
957
|
+
map2original_.Copy(tlen);
|
958
|
+
} else if (tlen < plen) {
|
959
|
+
map2original_.Copy(tlen);
|
960
|
+
map2original_.Insert(plen - tlen);
|
961
|
+
} else { // plen < tlen
|
962
|
+
map2original_.Copy(plen);
|
963
|
+
map2original_.Delete(tlen - plen);
|
964
|
+
}
|
965
|
+
|
966
|
+
++letter_count;
|
967
|
+
if (put >= kMaxScriptBytes) {
|
968
|
+
// Buffer is full
|
969
|
+
span->truncated = true;
|
970
|
+
break;
|
971
|
+
}
|
972
|
+
} // End while letters
|
973
|
+
|
974
|
+
// Do run of non-letters (tag | &NL | NL)*
|
975
|
+
while (take < byte_length_) {
|
976
|
+
// Do fast scan to next interesting byte
|
977
|
+
tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
|
978
|
+
take += tlen;
|
979
|
+
map2original_.Delete(tlen);
|
980
|
+
if (take >= byte_length_) {break;} // Might have scanned to end
|
981
|
+
|
982
|
+
// We are at a letter, nonletter, tag, or entity
|
983
|
+
if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
|
984
|
+
if (next_byte_[take] == '<') {
|
985
|
+
// Begining of tag; skip to end and go around again
|
986
|
+
tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
|
987
|
+
exit_state_);
|
988
|
+
sc = 0;
|
989
|
+
} else if (next_byte_[take] == '>') {
|
990
|
+
// Unexpected end of tag; skip it and go around again
|
991
|
+
tlen = 1; // Over the >
|
992
|
+
sc = 0;
|
993
|
+
} else if (next_byte_[take] == '&') {
|
994
|
+
// Expand entity, no advance
|
995
|
+
EntityToBuffer(next_byte_ + take, byte_length_ - take,
|
996
|
+
script_buffer_ + put, &tlen, &plen);
|
997
|
+
if (plen > 0) {
|
998
|
+
sc = GetUTF8LetterScriptNum(script_buffer_ + put);
|
999
|
+
}
|
1000
|
+
}
|
1001
|
+
} else {
|
1002
|
+
// Update 1..4
|
1003
|
+
tlen = UTF8OneCharLen(next_byte_ + take);
|
1004
|
+
sc = GetUTF8LetterScriptNum(next_byte_ + take);
|
1005
|
+
}
|
1006
|
+
if (sc != 0) {break;} // Letter found
|
1007
|
+
take += tlen; // Else advance
|
1008
|
+
map2original_.Delete(tlen);
|
1009
|
+
} // End while not-letters
|
1010
|
+
|
1011
|
+
script_buffer_[put++] = ' ';
|
1012
|
+
map2original_.Insert(1);
|
1013
|
+
|
1014
|
+
// Letter in wrong script ?
|
1015
|
+
if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;}
|
1016
|
+
if (put >= put_soft_limit) {
|
1017
|
+
// Buffer is almost full
|
1018
|
+
span->truncated = true;
|
1019
|
+
break;
|
1020
|
+
}
|
1021
|
+
}
|
1022
|
+
|
1023
|
+
// Almost done. Back up to a character boundary if needed
|
1024
|
+
while ((0 < take) && (take < byte_length_) &&
|
1025
|
+
((next_byte_[take] & 0xc0) == 0x80)) {
|
1026
|
+
// Back up over continuation byte
|
1027
|
+
--take;
|
1028
|
+
--put;
|
1029
|
+
}
|
1030
|
+
|
1031
|
+
// Update input position
|
1032
|
+
next_byte_ += take;
|
1033
|
+
byte_length_ -= take;
|
1034
|
+
|
1035
|
+
// Put four more spaces/NUL. Worst case is abcd _ _ _ \0
|
1036
|
+
// kMaxScriptBytes | | put
|
1037
|
+
script_buffer_[put + 0] = ' ';
|
1038
|
+
script_buffer_[put + 1] = ' ';
|
1039
|
+
script_buffer_[put + 2] = ' ';
|
1040
|
+
script_buffer_[put + 3] = '\0';
|
1041
|
+
map2original_.Insert(4);
|
1042
|
+
map2original_.Reset();
|
1043
|
+
|
1044
|
+
span->text_bytes = put; // Does not include the last four chars above
|
1045
|
+
return true;
|
1046
|
+
}
|
1047
|
+
|
1048
|
+
// Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase
|
1049
|
+
// List changes with each version of Unicode, so just always lowercase
|
1050
|
+
// Unicode 6.2.0:
|
1051
|
+
// ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN
|
1052
|
+
void ScriptScanner::LowerScriptSpan(LangSpan* span) {
|
1053
|
+
// If needed, lowercase all the text. If we do it sooner, might miss
|
1054
|
+
// lowercasing an entity such as Á
|
1055
|
+
// We only need to do this for Latn and Cyrl scripts
|
1056
|
+
map2uplow_.Clear();
|
1057
|
+
// Full Unicode lowercase of the entire buffer, including
|
1058
|
+
// four pad bytes off the end.
|
1059
|
+
// Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad
|
1060
|
+
// bytes and put the 0x00 in explicitly.
|
1061
|
+
// Build an offset map from script_buffer_lower_ back to script_buffer_
|
1062
|
+
int consumed, filled, changed;
|
1063
|
+
StringPiece istr(span->text, span->text_bytes + 3);
|
1064
|
+
StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer);
|
1065
|
+
|
1066
|
+
UTF8GenericReplace(&utf8repl_lettermarklower_obj,
|
1067
|
+
istr, ostr, is_plain_text_,
|
1068
|
+
&consumed, &filled, &changed, &map2uplow_);
|
1069
|
+
script_buffer_lower_[filled] = '\0';
|
1070
|
+
span->text = script_buffer_lower_;
|
1071
|
+
span->text_bytes = filled - 3;
|
1072
|
+
map2uplow_.Reset();
|
1073
|
+
}
|
1074
|
+
|
1075
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
1076
|
+
// Force Latin, Cyrillic, Greek scripts to be lowercase
|
1077
|
+
// Buffer ALWAYS has leading space and trailing space space space NUL
|
1078
|
+
bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) {
|
1079
|
+
bool ok = GetOneScriptSpan(span);
|
1080
|
+
if (ok) {
|
1081
|
+
LowerScriptSpan(span);
|
1082
|
+
}
|
1083
|
+
return ok;
|
1084
|
+
}
|
1085
|
+
|
1086
|
+
// Maps byte offset in most recent GetOneScriptSpan/Lower
|
1087
|
+
// span->text [0..text_bytes] into an additional byte offset from
|
1088
|
+
// span->offset, to get back to corresponding text in the original
|
1089
|
+
// input buffer.
|
1090
|
+
// text_offset must be the first byte
|
1091
|
+
// of a UTF-8 character, or just beyond the last character. Normally this
|
1092
|
+
// routine is called with the first byte of an interesting range and
|
1093
|
+
// again with the first byte of the following range.
|
1094
|
+
int ScriptScanner::MapBack(int text_offset) {
|
1095
|
+
return map2original_.MapBack(map2uplow_.MapBack(text_offset));
|
1096
|
+
}
|
1097
|
+
|
1098
|
+
|
1099
|
+
// Gets lscript number for letters; always returns
|
1100
|
+
// 0 (common script) for non-letters
|
1101
|
+
int GetUTF8LetterScriptNum(const char* src) {
|
1102
|
+
int srclen = UTF8OneCharLen(src);
|
1103
|
+
const uint8* usrc = reinterpret_cast<const uint8*>(src);
|
1104
|
+
return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj,
|
1105
|
+
&usrc, &srclen);
|
1106
|
+
}
|
1107
|
+
|
1108
|
+
} // namespace CLD2
|
1109
|
+
} // namespace chrome_lang_id
|