language_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
@@ -0,0 +1,570 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#include "encodings/compact_lang_det/getonescriptspan.h"
|
6
|
+
#include <stdio.h>
|
7
|
+
#include <string.h>
|
8
|
+
|
9
|
+
#include "encodings/lang_enc.h"
|
10
|
+
#include "encodings/compact_lang_det/utf8propjustletter.h"
|
11
|
+
#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
|
12
|
+
#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
|
13
|
+
|
14
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
15
|
+
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
|
16
|
+
#include "encodings/compact_lang_det/win/cld_google.h"
|
17
|
+
#include "encodings/compact_lang_det/win/cld_htmlutils.h"
|
18
|
+
#include "encodings/compact_lang_det/win/cld_unilib.h"
|
19
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
20
|
+
#include "encodings/compact_lang_det/win/cld_utf8utils.h"
|
21
|
+
|
22
|
+
static const Language GRAY_LANG = (Language)254;
|
23
|
+
|
24
|
+
static const int kMaxUpToWordBoundary = 50; // span < this make longer,
|
25
|
+
// else make shorter
|
26
|
+
static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes
|
27
|
+
// to round to word boundary,
|
28
|
+
// direction above
|
29
|
+
|
30
|
+
// Byte-indexed lookup table. An entry is nonzero only for the three ASCII
// characters '&' (0x26), '<' (0x3c) and '>' (0x3e); every other byte maps to 0.
static const char kSpecialSymbol[256] = {       // true for < > &
  // 0x00..0x1f
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0x20..0x3f: '&' at 0x26, '<' at 0x3c, '>' at 0x3e
  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
  // 0x40..0x5f
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0x60..0x7f
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  // 0x80..0xff: no non-ASCII byte is special
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
|
41
|
+
|
42
|
+
|
43
|
+
|
44
|
+
// Column numbers ("character classes") of the tag-parsing state table.
// These short names exist only to keep the kCharToSub initializer readable;
// they are #undef'd again right after the table.
#define LT 0      // <
#define GT 1      // >
#define EX 2      // !
#define HY 3      // -
#define QU 4      // "
#define AP 5      // '
#define SL 6      // /
// The letters of "SCRIPT" and "STYLE", matched case-insensitively by the
// table below (both the upper- and lower-case byte map to the same class).
#define S_ 7
#define C_ 8
#define R_ 9
#define I_ 10
#define P_ 11
#define T_ 12
#define Y_ 13
#define L_ 14
#define E_ 15
#define CR 16     // <cr> or <lf>
#define NL 17     // non-letter: ASCII whitespace, digit, punctuation
#define PL 18     // possible letter, incl. &
#define xx 19     // <unused>

// Map byte to one of ~20 interesting categories for cheap tag parsing.
// Indexed by the raw byte value; each group of four rows covers 64 bytes.
static const uint8 kCharToSub[256] = {
  // 0x00..0x3f: control characters, punctuation and digits
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,

  // 0x40..0x7f: ASCII letters; upper- and lower-case rows are identical
  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,

  // 0x80..0xbf: all classified as non-letter
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,

  // 0xc0..0xff: all classified as possible letter
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
};

// Drop the helper names so they cannot collide with anything further down.
#undef LT
#undef GT
#undef EX
#undef HY
#undef QU
#undef AP
#undef SL
#undef S_
#undef C_
#undef R_
#undef I_
#undef P_
#undef T_
#undef Y_
#undef L_
#undef E_
#undef CR
#undef NL
#undef PL
#undef xx
|
108
|
+
|
109
|
+
|
110
|
+
// The two terminal entries of the state table: reaching either one ends the
// scan in ScanToPossibleLetter(). OK is a normal stop, X_ is the error stop.
#define OK 0
#define X_ 1

// State machine to do cheap parse of non-letter strings incl. tags
// advances <tag>
//          |    |
// advances <tag> ... </tag>  for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          ||  (0)
// advances <tag <tag2>
//          ||  (0)
//
// Layout: one row per state, 20 columns per row. The column is the character
// class produced by kCharToSub; the entry is the next state. The row for
// state s therefore starts at index s * 20.
static const uint8 kTagParseTbl_0[] = {
// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK
  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*
  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
};

#undef OK
#undef X_
|
174
|
+
|
175
|
+
|
176
|
+
/*
|
177
|
+
// Convert GetTimeOfDay output to 64-bit usec
|
178
|
+
static inline uint64 Microseconds(const struct timeval& t) {
|
179
|
+
// The SumReducer uses uint64, so convert to (uint64) microseconds,
|
180
|
+
// not (double) seconds.
|
181
|
+
return t.tv_sec * 1000000ULL + t.tv_usec;
|
182
|
+
}
|
183
|
+
*/
|
184
|
+
|
185
|
+
|
186
|
+
// Returns true if character is < > or &
|
187
|
+
bool inline IsSpecial(char c) {
|
188
|
+
if ((c & 0xe0) == 0x20) {
|
189
|
+
return kSpecialSymbol[static_cast<uint8>(c)];
|
190
|
+
}
|
191
|
+
return false;
|
192
|
+
}
|
193
|
+
|
194
|
+
// Quick Skip to next letter or < > & or to end of string (eos)
|
195
|
+
// Always return is_letter for eos
|
196
|
+
int ScanToLetterOrSpecial(const char* src, int len) {
  // The generic scanner reports how far it advanced through this out-param;
  // that distance is the only result callers need.
  int advanced;
  cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj,
                       src,
                       len,
                       &advanced);
  return advanced;
}
|
202
|
+
|
203
|
+
|
204
|
+
|
205
|
+
// src points to non-letter, such as tag-opening '<'
|
206
|
+
// Return length from here to next possible letter
|
207
|
+
// On another < before >, return 1; on eos, return len (truncated input)
|
208
|
+
// advances <tag>
|
209
|
+
// | |
|
210
|
+
// advances <tag> ... </tag> for <script> <style>
|
211
|
+
// | |
|
212
|
+
// advances <!-- ... <tag> ... -->
|
213
|
+
// | |
|
214
|
+
// advances <tag
|
215
|
+
// || (1)
|
216
|
+
// advances <tag <tag2>
|
217
|
+
// || (1)
|
218
|
+
int ScanToPossibleLetter(const char* isrc, int len) {
|
219
|
+
const uint8* src = reinterpret_cast<const uint8*>(isrc);
|
220
|
+
const uint8* srclimit = src + len;
|
221
|
+
const uint8* tagParseTbl = kTagParseTbl_0;
|
222
|
+
int e = 0;
|
223
|
+
while (src < srclimit) {
|
224
|
+
e = tagParseTbl[kCharToSub[*src++]];
|
225
|
+
if ((e & ~1) == 0) {
|
226
|
+
// We overshot by one byte
|
227
|
+
--src;
|
228
|
+
break;
|
229
|
+
}
|
230
|
+
tagParseTbl = &kTagParseTbl_0[e * 20];
|
231
|
+
}
|
232
|
+
|
233
|
+
if (src >= srclimit) {
|
234
|
+
// We fell off the end of the text.
|
235
|
+
// It looks like the most common case for this is a truncated file, not
|
236
|
+
// mismatched angle brackets. So we pretend that the last char was '>'
|
237
|
+
return len;
|
238
|
+
}
|
239
|
+
|
240
|
+
// OK to be in state 0 or state 2 at exit
|
241
|
+
if ((e != 0) && (e != 2)) {
|
242
|
+
// Error, '<' followed by '<'
|
243
|
+
// We want to back up to first <, then advance by one byte past it
|
244
|
+
int offset = src - reinterpret_cast<const uint8*>(isrc);
|
245
|
+
// printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
|
246
|
+
|
247
|
+
// Backscan to first '<' and return enough length to just get past it
|
248
|
+
--offset; // back up over the second '<', which caused us to stop
|
249
|
+
while ((0 < offset) && (isrc[offset] != '<')) {
|
250
|
+
// Find the first '<', which is unmatched
|
251
|
+
--offset;
|
252
|
+
}
|
253
|
+
// skip to just beyond first '<'
|
254
|
+
// printf(" returning %d\n", offset + 1);
|
255
|
+
return offset + 1;
|
256
|
+
}
|
257
|
+
|
258
|
+
return src - reinterpret_cast<const uint8*>(isrc);
|
259
|
+
}
|
260
|
+
|
261
|
+
|
262
|
+
|
263
|
+
// Remembers the input range and allocates the two scratch buffers that the
// span-extraction methods write into.
ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text)
    : start_byte_(buffer),
      next_byte_(buffer),
      next_byte_limit_(buffer + buffer_length),
      byte_length_(buffer_length),
      is_plain_text_(is_plain_text),
      script_buffer_(new char[getone::kMaxScriptBuffer]),
      script_buffer_lower_(new char[getone::kMaxScriptLowerBuffer]) {
}
|
274
|
+
|
275
|
+
// Releases the two scratch buffers allocated by the constructor.
ScriptScanner::~ScriptScanner() {
  delete[] script_buffer_lower_;
  delete[] script_buffer_;
}
|
279
|
+
|
280
|
+
|
281
|
+
|
282
|
+
|
283
|
+
// Get to the first real non-tag letter or entity that is a letter
|
284
|
+
// Sets script of that letter
|
285
|
+
// Return len if no more letters
|
286
|
+
int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
|
287
|
+
int sc = UNKNOWN_LSCRIPT;
|
288
|
+
int skip = 0;
|
289
|
+
int tlen, plen;
|
290
|
+
|
291
|
+
// Do run of non-letters (tag | &NL | NL)*
|
292
|
+
while (skip < len) {
|
293
|
+
// Do fast scan to next interesting byte
|
294
|
+
// int oldskip = skip;
|
295
|
+
skip += ScanToLetterOrSpecial(src + skip, len - skip);
|
296
|
+
// TEMP
|
297
|
+
// printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
|
298
|
+
// oldskip, src[oldskip], skip, src[skip]);
|
299
|
+
|
300
|
+
// Check for no more letters/specials
|
301
|
+
if (skip >= len) {
|
302
|
+
// All done
|
303
|
+
return len;
|
304
|
+
}
|
305
|
+
|
306
|
+
// We are at a letter, nonletter, tag, or entity
|
307
|
+
if (IsSpecial(src[skip]) && !is_plain_text_) {
|
308
|
+
if (src[skip] == '<') {
|
309
|
+
// Begining of tag; skip to end and go around again
|
310
|
+
tlen = ScanToPossibleLetter(src + skip, len - skip);
|
311
|
+
sc = 0;
|
312
|
+
// printf("<...> ");
|
313
|
+
} else if (src[skip] == '>') {
|
314
|
+
// Unexpected end of tag; skip it and go around again
|
315
|
+
tlen = 1; // Over the >
|
316
|
+
sc = 0;
|
317
|
+
// printf("..> ");
|
318
|
+
} else if (src[skip] == '&') {
|
319
|
+
// Expand entity, no advance
|
320
|
+
char temp[4];
|
321
|
+
EntityToBuffer(src + skip, len - skip,
|
322
|
+
temp, &tlen, &plen);
|
323
|
+
sc = getone::GetUTF8LetterScriptNum(temp);
|
324
|
+
// printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
|
325
|
+
}
|
326
|
+
} else {
|
327
|
+
// Update 1..4 bytes
|
328
|
+
tlen = cld_UniLib::OneCharLen(src + skip);
|
329
|
+
sc = getone::GetUTF8LetterScriptNum(src + skip);
|
330
|
+
// printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
|
331
|
+
}
|
332
|
+
// TEMP
|
333
|
+
// printf("sc=%d ", sc);
|
334
|
+
if (sc != 0) {break;} // Letter found
|
335
|
+
skip += tlen; // Advance
|
336
|
+
}
|
337
|
+
|
338
|
+
*script = sc;
|
339
|
+
return skip;
|
340
|
+
}
|
341
|
+
|
342
|
+
|
343
|
+
|
344
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
345
|
+
// Buffer has leading space and all text is lowercased
|
346
|
+
// Extracts the next span of same-script letters from the input.
//
// On success fills *span: text points at script_buffer_, which starts with a
// single space, holds the letters with each run of non-letters collapsed to
// one space, and is followed by three spaces and a NUL that are not counted
// in text_bytes. Advances the scanner past the consumed input.
//
// Returns false when no more letters are found; in that case span->text_bytes
// is 0 and span->script is UNKNOWN_LSCRIPT.
bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
  span->text = script_buffer_;
  span->text_bytes = 0;
  span->offset = next_byte_ - start_byte_;
  span->script = UNKNOWN_LSCRIPT;
  span->lang = UNKNOWN_LANGUAGE;
  span->truncated = false;

  int spanscript;           // The script of this span
  int sc = UNKNOWN_LSCRIPT; // The script of next character
  int tlen, plen;           // Bytes taken from input / put into the buffer

  script_buffer_[0] = ' ';  // Always a space at front of output
  script_buffer_[1] = '\0';
  int take = 0;
  int put = 1;              // Start after the initial space

  // Get to the first real non-tag letter or entity that is a letter
  int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
  next_byte_ += skip;
  byte_length_ -= skip;
  if (byte_length_ <= 0) {
    return false;           // No more letters to be found
  }

  // There is at least one letter, so we know the script for this span
  span->script = static_cast<UnicodeLScript>(spanscript);

  // Go over alternating spans of same-script letters and non-letters,
  // copying letters to buffer with single spaces for each run of non-letters
  while (take < byte_length_) {
    // Copy run of letters in same script (&LS | LS)*
    bool need_break = false;
    while (take < byte_length_) {
      // We are at a letter, nonletter, tag, or entity
      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
        if (next_byte_[take] == '<') {
          // Beginning of tag
          sc = 0;
          break;
        } else if (next_byte_[take] == '>') {
          // Unexpected end of tag
          sc = 0;
          break;
        } else if (next_byte_[take] == '&') {
          // Copy entity, no advance
          EntityToBuffer(next_byte_ + take, byte_length_ - take,
                         script_buffer_ + put, &tlen, &plen);
          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
        }
      } else {
        // Real letter, safely copy up to 4 bytes, increment by 1..4
        // Will update by 1..4 bytes at Advance, below
        tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
        if (take < (byte_length_ - 3)) {
          // Fast case: always copy 4 bytes. A fixed-size memcpy avoids the
          // unaligned, aliasing-violating uint32 load/store while still
          // compiling down to a single word move.
          memcpy(script_buffer_ + put, next_byte_ + take, 4);
        } else {
          // Slow case, happens 1-3 times per input document
          memcpy(script_buffer_ + put, next_byte_ + take, plen);
        }
        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
      }

      // Allow continue across a single letter in a different script:
      // A B D = three scripts, c = common script, i = inherited script,
      // - = don't care, ( = take position before the += below
      //  AAA(A-    continue
      //
      //  AAA(BA    continue
      //  AAA(BB    break
      //  AAA(Bc    continue (breaks after B)
      //  AAA(BD    break
      //  AAA(Bi    break
      //
      //  AAA(c-    break
      //
      //  AAA(i-    continue
      //

      if ((sc != spanscript) && (sc != ULScript_Inherited)) {
        // Might need to break this script span
        if (sc == ULScript_Common) {
          need_break = true;
        } else {
          // Look at next following character, ignoring entity as Common.
          // If the current character is the last one in the input there is
          // nothing to look at; treat that as Common rather than reading
          // past the end of the buffer.
          const int next_offset = take + tlen;
          int sc2 = ULScript_Common;
          if (next_offset < byte_length_) {
            sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + next_offset);
          }
          if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
            need_break = true;
          }
        }
      }
      if (need_break) {break;}  // Non-letter or letter in wrong script

      take += tlen;                   // Advance
      put += plen;                    // Advance
      if (put >= getone::kMaxScriptBytes) {
        // Buffer is full
        span->truncated = true;
        break;
      }
    }     // End while letters

    // Do run of non-letters (tag | &NL | NL)*
    while (take < byte_length_) {
      // Do fast scan to next interesting byte
      take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);

      // Check for no more letters/specials
      if (take >= byte_length_) {
        take = byte_length_;
        break;
      }

      // We are at a letter, nonletter, tag, or entity
      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
        if (next_byte_[take] == '<') {
          // Beginning of tag; skip to end and go around again
          tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
          sc = 0;
        } else if (next_byte_[take] == '>') {
          // Unexpected end of tag; skip it and go around again
          tlen = 1;         // Over the >
          sc = 0;
        } else if (next_byte_[take] == '&') {
          // Expand entity, no advance
          EntityToBuffer(next_byte_ + take, byte_length_ - take,
                         script_buffer_ + put, &tlen, &plen);
          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
        }
      } else {
        // Update 1..4
        tlen = cld_UniLib::OneCharLen(next_byte_ + take);
        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
      }
      if (sc != 0) {break;}           // Letter found
      take += tlen;                   // Advance
    }     // End while not-letters

    // One space stands in for the whole run of non-letters
    script_buffer_[put++] = ' ';

    // We are at a letter again (or eos), after letter* not-letter*
    if (sc != spanscript) {break;}    // Letter in wrong script
    if (put >= getone::kMaxScriptBytes - 8) {
      // Buffer is almost full
      span->truncated = true;
      break;
    }
  }

  // Update input position
  next_byte_ += take;
  byte_length_ -= take;

  // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
  //                          kMaxScriptBytes |   | put
  script_buffer_[put + 0] = ' ';
  script_buffer_[put + 1] = ' ';
  script_buffer_[put + 2] = ' ';
  script_buffer_[put + 3] = '\0';

  span->text_bytes = put;       // Does not include the last four chars above

  return true;
}
|
533
|
+
|
534
|
+
// Force Latin, Cyrillic, Greek scripts to be lowercase
|
535
|
+
void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
  // On Windows, text is lowercased beforehand, so this is a no-op there.
#if !defined(CLD_WINDOWS)
  // Lowercasing happens here, after span extraction, because doing it any
  // sooner might miss an entity such as &Aacute;
  // Only the Latin, Cyrillic and Greek scripts are lowercased.
  const bool needs_lowering = (span->script == ULScript_Latin) ||
                              (span->script == ULScript_Cyrillic) ||
                              (span->script == ULScript_Greek);
  if (!needs_lowering) {
    return;
  }

  // Full Unicode lowercase of the entire buffer, including the four pad
  // bytes off the end; those four are then excluded again from text_bytes.
  int consumed, filled;
  UniLib::ToLower(span->text, span->text_bytes + 4,
                  script_buffer_lower_, getone::kMaxScriptLowerBuffer,
                  &consumed, &filled);
  span->text = script_buffer_lower_;
  span->text_bytes = filled - 4;
#endif
}
|
555
|
+
|
556
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
557
|
+
// Force Latin, Cyrillic, and Greek scripts to be lowercase
|
558
|
+
bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
  // Extract first, then lowercase. The lowering step runs regardless of
  // whether a span was found; the extraction result is what gets returned.
  const bool found_span = GetOneScriptSpan(span);
  LowerScriptSpan(span);
  return found_span;
}
|
563
|
+
|
564
|
+
// Gets lscript number for letters; always returns
|
565
|
+
// 0 (common script) for non-letters
|
566
|
+
int getone::GetUTF8LetterScriptNum(const char* src) {
|
567
|
+
int srclen = cld_UniLib::OneCharLen(src);
|
568
|
+
const uint8* usrc = reinterpret_cast<const uint8*>(src);
|
569
|
+
return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
|
570
|
+
}
|
@@ -0,0 +1,131 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|
7
|
+
|
8
|
+
#include "encodings/compact_lang_det/letterscript_enum.h"
|
9
|
+
#include "encodings/compact_lang_det/compact_lang_det_impl.h"
|
10
|
+
|
11
|
+
namespace getone {
|
12
|
+
static const int kMaxScriptBuffer = 4096;
|
13
|
+
static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
|
14
|
+
static const int kMaxScriptBytes = kMaxScriptBuffer- 8; // Leave some room
|
15
|
+
static const int kMaxAnswerBuffer = 256;
|
16
|
+
|
17
|
+
typedef enum UnicodeLScript ULScript;
|
18
|
+
|
19
|
+
typedef struct {
|
20
|
+
char* text; // Pointer to the span, somewhere
|
21
|
+
int text_bytes; // Number of bytes of text in the span
|
22
|
+
int offset; // Offset of start of span in original input buffer
|
23
|
+
ULScript script; // Script of all the letters in this span
|
24
|
+
Language lang; // Language identified for this span
|
25
|
+
bool truncated; // true if buffer filled up before a
|
26
|
+
// different script or EOF was found
|
27
|
+
} LangSpan;
|
28
|
+
|
29
|
+
|
30
|
+
static inline bool IsContinuationByte(char c) {
|
31
|
+
return static_cast<signed char>(c) < -64;
|
32
|
+
}
|
33
|
+
|
34
|
+
// Gets lscript number for letters; always returns
|
35
|
+
// 0 (common script) for non-letters
|
36
|
+
int GetUTF8LetterScriptNum(const char* src);
|
37
|
+
|
38
|
+
|
39
|
+
// Update src pointer to point to next quadgram, +2..+5
|
40
|
+
// Looks at src[0..4]
|
41
|
+
const char* AdvanceQuad(const char* src);
|
42
|
+
} // end namespace getone
|
43
|
+
|
44
|
+
|
45
|
+
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
class ScriptScanner {
|
50
|
+
public:
|
51
|
+
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
|
52
|
+
~ScriptScanner();
|
53
|
+
|
54
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
55
|
+
bool GetOneScriptSpan(getone::LangSpan* span);
|
56
|
+
|
57
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
58
|
+
void LowerScriptSpan(getone::LangSpan* span);
|
59
|
+
|
60
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
61
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
62
|
+
bool GetOneScriptSpanLower(getone::LangSpan* span);
|
63
|
+
|
64
|
+
private:
|
65
|
+
int SkipToFrontOfSpan(const char* src, int len, int* script);
|
66
|
+
|
67
|
+
const char* start_byte_;
|
68
|
+
const char* next_byte_;
|
69
|
+
const char* next_byte_limit_;
|
70
|
+
int byte_length_;
|
71
|
+
bool is_plain_text_;
|
72
|
+
char* script_buffer_; // Holds text with expanded entities
|
73
|
+
char* script_buffer_lower_; // Holds lowercased text
|
74
|
+
};
|
75
|
+
|
76
|
+
|
77
|
+
// Declaration of the language-span scanner. Only the interface is declared
// here; none of the member functions other than script() are defined in
// this header.
// NOTE(review): the roles of the smoothing and ring-buffer ("rb_") members
// below are inferred from their names only -- confirm against the
// implementation file before relying on them.
class LangScanner {
 public:
  LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
              getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
              int maxlangs, int minlangspan);
  ~LangScanner();


  // Returns the stored script number.
  int script() {return script_;}

  // Use new text
  // Keep smoothing state if same script, otherwise reinit smoothing
  void NewText(getone::LangSpan* spn);

  bool GetOneShortLangSpanBoot(getone::LangSpan* span);  // Just for bootstrapping
  bool GetOneLangSpanBoot(getone::LangSpan* span);       // Just for bootstrapping

  // The real ones
  bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
                           getone::LangSpan* span);
  bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
                      getone::LangSpan* span);

  // Increases language bias by delta
  void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
                       Language key, int delta);

  // For debugging output
  // (public data members, each buffer getone::kMaxAnswerBuffer bytes long)
  int next_answer_;
  char answer_buffer_[getone::kMaxAnswerBuffer];
  char answer_buffer2_[getone::kMaxAnswerBuffer];
  char answer_buffer3_[getone::kMaxAnswerBuffer];
  char answer_buffer4_[getone::kMaxAnswerBuffer];

 private:
  const char* start_byte_;
  const char* next_byte_limit_;
  const char* next_byte_;
  const char* onelangspan_begin_;
  int byte_length_;
  int script_;                // Value returned by script()
  Language spanlang_;
  int smoothwidth_;
  int smoothwidth_2_;
  int smoothcandidates_;
  int maxlangs_;
  int minlangspan_;
  int rb_size_;
  int next_rb_;
  int rb_mask_;
  uint32* rb_;
  int* offset_rb_;
};
|
130
|
+
|
131
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|