language_detection 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
|
@@ -0,0 +1,570 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/getonescriptspan.h"
|
|
6
|
+
#include <stdio.h>
|
|
7
|
+
#include <string.h>
|
|
8
|
+
|
|
9
|
+
#include "encodings/lang_enc.h"
|
|
10
|
+
#include "encodings/compact_lang_det/utf8propjustletter.h"
|
|
11
|
+
#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
|
|
12
|
+
#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
|
|
13
|
+
|
|
14
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
|
15
|
+
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
|
|
16
|
+
#include "encodings/compact_lang_det/win/cld_google.h"
|
|
17
|
+
#include "encodings/compact_lang_det/win/cld_htmlutils.h"
|
|
18
|
+
#include "encodings/compact_lang_det/win/cld_unilib.h"
|
|
19
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
|
20
|
+
#include "encodings/compact_lang_det/win/cld_utf8utils.h"
|
|
21
|
+
|
|
22
|
+
// Language value 254 reserved under the name GRAY_LANG.
// NOTE(review): not referenced in the visible part of this file; its exact
// meaning is defined by whoever consumes it -- confirm before changing.
static const Language GRAY_LANG = (Language)254;

// Word-boundary rounding parameters (not referenced in the visible part of
// this file).
static const int kMaxUpToWordBoundary = 50;       // span < this make longer,
                                                  // else make shorter
static const int kMaxAdvanceToWordBoundary = 10;  // +/- this many bytes
                                                  // to round to word boundary,
                                                  // direction above
|
|
29
|
+
|
|
30
|
+
// Byte-indexed lookup table. The only nonzero entries are at 0x26 ('&'),
// 0x3C ('<') and 0x3E ('>'); every other byte maps to 0. Each row below
// covers 32 consecutive byte values (row 0 = 0x00..0x1F, row 1 = 0x20..0x3F,
// and so on).
static const char kSpecialSymbol[256] = {  // true for < > &
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
// Short-lived category codes used only to keep the kCharToSub initializer
// readable; all of them are #undef'd again right after the table. The code
// values are the column indices of the tag-parsing state table.
#define LT 0      // <
#define GT 1      // >
#define EX 2      // !
#define HY 3      // -
#define QU 4      // "
#define AP 5      // '
#define SL 6      // /
#define S_ 7      // S or s
#define C_ 8      // C or c
#define R_ 9      // R or r
#define I_ 10     // I or i
#define P_ 11     // P or p
#define T_ 12     // T or t
#define Y_ 13     // Y or y
#define L_ 14     // L or l
#define E_ 15     // E or e
#define CR 16     // <cr> or <lf>
#define NL 17     // non-letter: ASCII whitespace, digit, punctuation
#define PL 18     // possible letter, incl. &
#define xx 19     // <unused>

// Map byte to one of ~20 interesting categories for cheap tag parsing.
// Each row covers 16 consecutive byte values. The 0x40 and 0x60 rows are
// identical, so the letters of SCRIPT/STYLE are matched case-insensitively.
// Bytes 0x80..0xBF map to NL and bytes 0xC0..0xFF map to PL.
static const uint8 kCharToSub[256] = {
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,

  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,

  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,

  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
};

#undef LT
#undef GT
#undef EX
#undef HY
#undef QU
#undef AP
#undef SL
#undef S_
#undef C_
#undef R_
#undef I_
#undef P_
#undef T_
#undef Y_
#undef L_
#undef E_
#undef CR
#undef NL
#undef PL
#undef xx
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
// Temporary names for the two terminal states of the table below; both are
// #undef'd right after it.
#define OK 0      // stop: the current byte is a possible letter
#define X_ 1      // stop: error

// State machine to do cheap parse of non-letter strings incl. tags.
//
// Layout: one row of 20 entries per state, one column per kCharToSub
// category, stored flat, so the row for state s starts at index s * 20.
// Each entry is the next state. Reaching OK (0) or X_ (1) ends the scan.
//
// What the transitions encode:
//   <tag>                        advances through the closing '>' and any
//                                non-letters that follow (state 2)
//   <script ...> ... </script>   advances through the closing tag's '>'
//   <style ...>  ... </style>    advances through the closing tag's '>'
//   <!-- ... <tag> ... -->       advances through the closing "-->"
//   <tag <tag2>                  a second '<' inside an unquoted tag is an
//                                error (X_)
// Inside a tag, "..." and '...' are skipped as quoted text (states 10/11);
// a CR inside a quoted string drops to state 12, which runs to the next '>'.
static const uint8 kTagParseTbl_0[] = {
// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK
  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*
  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
};

#undef OK
#undef X_
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
/*
|
|
177
|
+
// Convert GetTimeOfDay output to 64-bit usec
|
|
178
|
+
static inline uint64 Microseconds(const struct timeval& t) {
|
|
179
|
+
// The SumReducer uses uint64, so convert to (uint64) microseconds,
|
|
180
|
+
// not (double) seconds.
|
|
181
|
+
return t.tv_sec * 1000000ULL + t.tv_usec;
|
|
182
|
+
}
|
|
183
|
+
*/
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
// Returns true if character is < > or &
|
|
187
|
+
bool inline IsSpecial(char c) {
|
|
188
|
+
if ((c & 0xe0) == 0x20) {
|
|
189
|
+
return kSpecialSymbol[static_cast<uint8>(c)];
|
|
190
|
+
}
|
|
191
|
+
return false;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
// Quick Skip to next letter or < > & or to end of string (eos)
|
|
195
|
+
// Always return is_letter for eos
|
|
196
|
+
int ScanToLetterOrSpecial(const char* src, int len) {
|
|
197
|
+
int bytes_consumed;
|
|
198
|
+
cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len,
|
|
199
|
+
&bytes_consumed);
|
|
200
|
+
return bytes_consumed;
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
// src points to non-letter, such as tag-opening '<'
|
|
206
|
+
// Return length from here to next possible letter
|
|
207
|
+
// On eos or another < before >, return 1
|
|
208
|
+
// advances <tag>
|
|
209
|
+
// | |
|
|
210
|
+
// advances <tag> ... </tag> for <script> <style>
|
|
211
|
+
// | |
|
|
212
|
+
// advances <!-- ... <tag> ... -->
|
|
213
|
+
// | |
|
|
214
|
+
// advances <tag
|
|
215
|
+
// || (1)
|
|
216
|
+
// advances <tag <tag2>
|
|
217
|
+
// || (1)
|
|
218
|
+
int ScanToPossibleLetter(const char* isrc, int len) {
|
|
219
|
+
const uint8* src = reinterpret_cast<const uint8*>(isrc);
|
|
220
|
+
const uint8* srclimit = src + len;
|
|
221
|
+
const uint8* tagParseTbl = kTagParseTbl_0;
|
|
222
|
+
int e = 0;
|
|
223
|
+
while (src < srclimit) {
|
|
224
|
+
e = tagParseTbl[kCharToSub[*src++]];
|
|
225
|
+
if ((e & ~1) == 0) {
|
|
226
|
+
// We overshot by one byte
|
|
227
|
+
--src;
|
|
228
|
+
break;
|
|
229
|
+
}
|
|
230
|
+
tagParseTbl = &kTagParseTbl_0[e * 20];
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
if (src >= srclimit) {
|
|
234
|
+
// We fell off the end of the text.
|
|
235
|
+
// It looks like the most common case for this is a truncated file, not
|
|
236
|
+
// mismatched angle brackets. So we pretend that the last char was '>'
|
|
237
|
+
return len;
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
// OK to be in state 0 or state 2 at exit
|
|
241
|
+
if ((e != 0) && (e != 2)) {
|
|
242
|
+
// Error, '<' followed by '<'
|
|
243
|
+
// We want to back up to first <, then advance by one byte past it
|
|
244
|
+
int offset = src - reinterpret_cast<const uint8*>(isrc);
|
|
245
|
+
// printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
|
|
246
|
+
|
|
247
|
+
// Backscan to first '<' and return enough length to just get past it
|
|
248
|
+
--offset; // back up over the second '<', which caused us to stop
|
|
249
|
+
while ((0 < offset) && (isrc[offset] != '<')) {
|
|
250
|
+
// Find the first '<', which is unmatched
|
|
251
|
+
--offset;
|
|
252
|
+
}
|
|
253
|
+
// skip to just beyond first '<'
|
|
254
|
+
// printf(" returning %d\n", offset + 1);
|
|
255
|
+
return offset + 1;
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
return src - reinterpret_cast<const uint8*>(isrc);
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
// Sets up a scanner over buffer[0..buffer_length). The input buffer is only
// pointed to, not copied. Two scratch buffers are heap-allocated here: one
// for the extracted span text and one for its lowercased form; the
// destructor releases them.
ScriptScanner::ScriptScanner(const char* buffer, int buffer_length,
                             bool is_plain_text)
    : start_byte_(buffer),
      next_byte_(buffer),
      next_byte_limit_(buffer + buffer_length),
      byte_length_(buffer_length),
      is_plain_text_(is_plain_text) {
  // Span text scratch buffer
  script_buffer_ = new char[getone::kMaxScriptBuffer];
  // Lowercased copy scratch buffer
  script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
}
|
|
274
|
+
|
|
275
|
+
// Releases the two scratch buffers allocated by the constructor.
ScriptScanner::~ScriptScanner() {
  delete[] script_buffer_lower_;
  delete[] script_buffer_;
}
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
// Get to the first real non-tag letter or entity that is a letter
|
|
284
|
+
// Sets script of that letter
|
|
285
|
+
// Return len if no more letters
|
|
286
|
+
int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
|
|
287
|
+
int sc = UNKNOWN_LSCRIPT;
|
|
288
|
+
int skip = 0;
|
|
289
|
+
int tlen, plen;
|
|
290
|
+
|
|
291
|
+
// Do run of non-letters (tag | &NL | NL)*
|
|
292
|
+
while (skip < len) {
|
|
293
|
+
// Do fast scan to next interesting byte
|
|
294
|
+
// int oldskip = skip;
|
|
295
|
+
skip += ScanToLetterOrSpecial(src + skip, len - skip);
|
|
296
|
+
// TEMP
|
|
297
|
+
// printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
|
|
298
|
+
// oldskip, src[oldskip], skip, src[skip]);
|
|
299
|
+
|
|
300
|
+
// Check for no more letters/specials
|
|
301
|
+
if (skip >= len) {
|
|
302
|
+
// All done
|
|
303
|
+
return len;
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
// We are at a letter, nonletter, tag, or entity
|
|
307
|
+
if (IsSpecial(src[skip]) && !is_plain_text_) {
|
|
308
|
+
if (src[skip] == '<') {
|
|
309
|
+
// Begining of tag; skip to end and go around again
|
|
310
|
+
tlen = ScanToPossibleLetter(src + skip, len - skip);
|
|
311
|
+
sc = 0;
|
|
312
|
+
// printf("<...> ");
|
|
313
|
+
} else if (src[skip] == '>') {
|
|
314
|
+
// Unexpected end of tag; skip it and go around again
|
|
315
|
+
tlen = 1; // Over the >
|
|
316
|
+
sc = 0;
|
|
317
|
+
// printf("..> ");
|
|
318
|
+
} else if (src[skip] == '&') {
|
|
319
|
+
// Expand entity, no advance
|
|
320
|
+
char temp[4];
|
|
321
|
+
EntityToBuffer(src + skip, len - skip,
|
|
322
|
+
temp, &tlen, &plen);
|
|
323
|
+
sc = getone::GetUTF8LetterScriptNum(temp);
|
|
324
|
+
// printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
|
|
325
|
+
}
|
|
326
|
+
} else {
|
|
327
|
+
// Update 1..4 bytes
|
|
328
|
+
tlen = cld_UniLib::OneCharLen(src + skip);
|
|
329
|
+
sc = getone::GetUTF8LetterScriptNum(src + skip);
|
|
330
|
+
// printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
|
|
331
|
+
}
|
|
332
|
+
// TEMP
|
|
333
|
+
// printf("sc=%d ", sc);
|
|
334
|
+
if (sc != 0) {break;} // Letter found
|
|
335
|
+
skip += tlen; // Advance
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
*script = sc;
|
|
339
|
+
return skip;
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
|
345
|
+
// Buffer has leading space and all text is lowercased
|
|
346
|
+
bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
|
|
347
|
+
span->text = script_buffer_;
|
|
348
|
+
span->text_bytes = 0;
|
|
349
|
+
span->offset = next_byte_ - start_byte_;
|
|
350
|
+
span->script = UNKNOWN_LSCRIPT;
|
|
351
|
+
span->lang = UNKNOWN_LANGUAGE;
|
|
352
|
+
span->truncated = false;
|
|
353
|
+
|
|
354
|
+
// printf("GetOneScriptSpan[[ ");
|
|
355
|
+
// struct timeval script_start, script_mid, script_end;
|
|
356
|
+
|
|
357
|
+
int spanscript; // The script of this span
|
|
358
|
+
int sc = UNKNOWN_LSCRIPT; // The script of next character
|
|
359
|
+
int tlen, plen;
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
script_buffer_[0] = ' '; // Always a space at front of output
|
|
363
|
+
script_buffer_[1] = '\0';
|
|
364
|
+
int take = 0;
|
|
365
|
+
int put = 1; // Start after the initial space
|
|
366
|
+
|
|
367
|
+
// gettimeofday(&script_start, NULL);
|
|
368
|
+
// Get to the first real non-tag letter or entity that is a letter
|
|
369
|
+
int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
|
|
370
|
+
next_byte_ += skip;
|
|
371
|
+
byte_length_ -= skip;
|
|
372
|
+
if (byte_length_ <= 0) {
|
|
373
|
+
// printf("]]\n");
|
|
374
|
+
return false; // No more letters to be found
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
// gettimeofday(&script_mid, NULL);
|
|
378
|
+
|
|
379
|
+
// There is at least one letter, so we know the script for this span
|
|
380
|
+
// printf("{%d} ", spanscript);
|
|
381
|
+
span->script = (UnicodeLScript)spanscript;
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
// Go over alternating spans of same-script letters and non-letters,
|
|
385
|
+
// copying letters to buffer with single spaces for each run of non-letters
|
|
386
|
+
while (take < byte_length_) {
|
|
387
|
+
// Copy run of letters in same script (&LS | LS)*
|
|
388
|
+
int letter_count = 0; // Keep track of word length
|
|
389
|
+
bool need_break = false;
|
|
390
|
+
while (take < byte_length_) {
|
|
391
|
+
// We are at a letter, nonletter, tag, or entity
|
|
392
|
+
if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
|
|
393
|
+
// printf("\"%c\" ", next_byte_[take]);
|
|
394
|
+
if (next_byte_[take] == '<') {
|
|
395
|
+
// Begining of tag
|
|
396
|
+
sc = 0;
|
|
397
|
+
break;
|
|
398
|
+
} else if (next_byte_[take] == '>') {
|
|
399
|
+
// Unexpected end of tag
|
|
400
|
+
sc = 0;
|
|
401
|
+
break;
|
|
402
|
+
} else if (next_byte_[take] == '&') {
|
|
403
|
+
// Copy entity, no advance
|
|
404
|
+
EntityToBuffer(next_byte_ + take, byte_length_ - take,
|
|
405
|
+
script_buffer_ + put, &tlen, &plen);
|
|
406
|
+
sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
|
|
407
|
+
}
|
|
408
|
+
} else {
|
|
409
|
+
// Real letter, safely copy up to 4 bytes, increment by 1..4
|
|
410
|
+
// Will update by 1..4 bytes at Advance, below
|
|
411
|
+
tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
|
|
412
|
+
if (take < (byte_length_ - 3)) {
|
|
413
|
+
// Fast case
|
|
414
|
+
*reinterpret_cast<uint32*>(script_buffer_ + put) =
|
|
415
|
+
*reinterpret_cast<const uint32*>(next_byte_ + take);
|
|
416
|
+
} else {
|
|
417
|
+
// Slow case, happens 1-3 times per input document
|
|
418
|
+
memcpy(script_buffer_ + put, next_byte_ + take, plen);
|
|
419
|
+
}
|
|
420
|
+
sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
|
|
421
|
+
}
|
|
422
|
+
// printf("sc(%c)=%d ", next_byte_[take], sc);
|
|
423
|
+
// char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);
|
|
424
|
+
// xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);
|
|
425
|
+
|
|
426
|
+
// Allow continue across a single letter in a different script:
|
|
427
|
+
// A B D = three scripts, c = common script, i = inherited script,
|
|
428
|
+
// - = don't care, ( = take position before the += below
|
|
429
|
+
// AAA(A- continue
|
|
430
|
+
//
|
|
431
|
+
// AAA(BA continue
|
|
432
|
+
// AAA(BB break
|
|
433
|
+
// AAA(Bc continue (breaks after B)
|
|
434
|
+
// AAA(BD break
|
|
435
|
+
// AAA(Bi break
|
|
436
|
+
//
|
|
437
|
+
// AAA(c- break
|
|
438
|
+
//
|
|
439
|
+
// AAA(i- continue
|
|
440
|
+
//
|
|
441
|
+
|
|
442
|
+
if ((sc != spanscript) && (sc != ULScript_Inherited)) {
|
|
443
|
+
// Might need to break this script span
|
|
444
|
+
if (sc == ULScript_Common) {
|
|
445
|
+
need_break = true;
|
|
446
|
+
} else {
|
|
447
|
+
// Look at next following character, ignoring entity as Common
|
|
448
|
+
int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
|
|
449
|
+
if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
|
|
450
|
+
need_break = true;
|
|
451
|
+
}
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
if (need_break) {break;} // Non-letter or letter in wrong script
|
|
455
|
+
|
|
456
|
+
take += tlen; // Advance
|
|
457
|
+
put += plen; // Advance
|
|
458
|
+
++letter_count;
|
|
459
|
+
if (put >= getone::kMaxScriptBytes) {
|
|
460
|
+
// Buffer is full
|
|
461
|
+
span->truncated = true;
|
|
462
|
+
break;
|
|
463
|
+
}
|
|
464
|
+
} // End while letters
|
|
465
|
+
|
|
466
|
+
// Do run of non-letters (tag | &NL | NL)*
|
|
467
|
+
while (take < byte_length_) {
|
|
468
|
+
// Do fast scan to next interesting byte
|
|
469
|
+
take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
|
|
470
|
+
|
|
471
|
+
// Check for no more letters/specials
|
|
472
|
+
if (take >= byte_length_) {
|
|
473
|
+
take = byte_length_;
|
|
474
|
+
break;
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
// We are at a letter, nonletter, tag, or entity
|
|
478
|
+
if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
|
|
479
|
+
// printf("\"%c\" ", next_byte_[take]);
|
|
480
|
+
if (next_byte_[take] == '<') {
|
|
481
|
+
// Begining of tag; skip to end and go around again
|
|
482
|
+
tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
|
|
483
|
+
sc = 0;
|
|
484
|
+
// printf("<...> ");
|
|
485
|
+
} else if (next_byte_[take] == '>') {
|
|
486
|
+
// Unexpected end of tag; skip it and go around again
|
|
487
|
+
tlen = 1; // Over the >
|
|
488
|
+
sc = 0;
|
|
489
|
+
// printf("..> ");
|
|
490
|
+
} else if (next_byte_[take] == '&') {
|
|
491
|
+
// Expand entity, no advance
|
|
492
|
+
EntityToBuffer(next_byte_ + take, byte_length_ - take,
|
|
493
|
+
script_buffer_ + put, &tlen, &plen);
|
|
494
|
+
sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
|
|
495
|
+
}
|
|
496
|
+
} else {
|
|
497
|
+
// Update 1..4
|
|
498
|
+
tlen = cld_UniLib::OneCharLen(next_byte_ + take);
|
|
499
|
+
sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
|
|
500
|
+
}
|
|
501
|
+
// printf("sc[%c]=%d ", next_byte_[take], sc);
|
|
502
|
+
if (sc != 0) {break;} // Letter found
|
|
503
|
+
take += tlen; // Advance
|
|
504
|
+
} // End while not-letters
|
|
505
|
+
|
|
506
|
+
script_buffer_[put++] = ' ';
|
|
507
|
+
|
|
508
|
+
// We are at a letter again (or eos), after letter* not-letter*
|
|
509
|
+
if (sc != spanscript) {break;} // Letter in wrong script
|
|
510
|
+
if (put >= getone::kMaxScriptBytes - 8) {
|
|
511
|
+
// Buffer is almost full
|
|
512
|
+
span->truncated = true;
|
|
513
|
+
break;
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
// Update input position
|
|
518
|
+
next_byte_ += take;
|
|
519
|
+
byte_length_ -= take;
|
|
520
|
+
|
|
521
|
+
// Put four more spaces/NUL. Worst case is abcd _ _ _ \0
|
|
522
|
+
// kMaxScriptBytes | | put
|
|
523
|
+
script_buffer_[put + 0] = ' ';
|
|
524
|
+
script_buffer_[put + 1] = ' ';
|
|
525
|
+
script_buffer_[put + 2] = ' ';
|
|
526
|
+
script_buffer_[put + 3] = '\0';
|
|
527
|
+
|
|
528
|
+
span->text_bytes = put; // Does not include the last four chars above
|
|
529
|
+
|
|
530
|
+
// printf(" %d]]\n\n", put);
|
|
531
|
+
return true;
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
// Force Latin, Cyrillic, Greek scripts to be lowercase
|
|
535
|
+
void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
|
|
536
|
+
// On Windows, text is lowercased beforehand, so no need to do anything here.
|
|
537
|
+
#if !defined(CLD_WINDOWS)
|
|
538
|
+
// If needed, lowercase all the text. If we do it sooner, might miss
|
|
539
|
+
// lowercasing an entity such as Á
|
|
540
|
+
// We only need to do this for Latn and Cyrl scripts
|
|
541
|
+
if ((span->script == ULScript_Latin) ||
|
|
542
|
+
(span->script == ULScript_Cyrillic) ||
|
|
543
|
+
(span->script == ULScript_Greek)) {
|
|
544
|
+
// Full Unicode lowercase of the entire buffer, including
|
|
545
|
+
// four pad bytes off the end
|
|
546
|
+
int consumed, filled;
|
|
547
|
+
UniLib::ToLower(span->text, span->text_bytes + 4,
|
|
548
|
+
script_buffer_lower_, getone::kMaxScriptLowerBuffer,
|
|
549
|
+
&consumed, &filled);
|
|
550
|
+
span->text = script_buffer_lower_;
|
|
551
|
+
span->text_bytes = filled - 4;
|
|
552
|
+
}
|
|
553
|
+
#endif
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
|
557
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
|
558
|
+
bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
|
|
559
|
+
bool ok = GetOneScriptSpan(span);
|
|
560
|
+
LowerScriptSpan(span);
|
|
561
|
+
return ok;
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
// Gets lscript number for letters; always returns
|
|
565
|
+
// 0 (common script) for non-letters
|
|
566
|
+
int getone::GetUTF8LetterScriptNum(const char* src) {
|
|
567
|
+
int srclen = cld_UniLib::OneCharLen(src);
|
|
568
|
+
const uint8* usrc = reinterpret_cast<const uint8*>(src);
|
|
569
|
+
return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
|
|
570
|
+
}
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|
|
7
|
+
|
|
8
|
+
#include "encodings/compact_lang_det/letterscript_enum.h"
|
|
9
|
+
#include "encodings/compact_lang_det/compact_lang_det_impl.h"
|
|
10
|
+
|
|
11
|
+
// Shared constants, the span record, and small UTF-8 helpers used by the
// script-span scanner.
namespace getone {
  // Size in bytes of the buffer that receives one span's text.
  static const int kMaxScriptBuffer = 4096;
  // Size in bytes of the buffer that receives the lowercased copy of a span
  // (1.5x the span buffer).
  static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
  // Fill level at which the span buffer is treated as full; 8 bytes short of
  // its real size.
  static const int kMaxScriptBytes = kMaxScriptBuffer- 8;   // Leave some room
  static const int kMaxAnswerBuffer = 256;

  typedef enum UnicodeLScript ULScript;

  // One run of text whose letters all belong to a single script.
  typedef struct {
    char* text;             // Pointer to the span, somewhere
    int text_bytes;         // Number of bytes of text in the span
    int offset;             // Offset of start of span in original input buffer
    ULScript script;        // Script of all the letters in this span
    Language lang;          // Language identified for this span
    bool truncated;         // true if buffer filled up before a
                            // different script or EOF was found
  } LangSpan;


  // True when c, viewed as a signed byte, is below -64, i.e. its unsigned
  // value is in 0x80..0xBF (bit pattern 10xxxxxx).
  static inline bool IsContinuationByte(char c) {
    return static_cast<signed char>(c) < -64;
  }

  // Gets lscript number for letters; always returns
  // 0 (common script) for non-letters
  int GetUTF8LetterScriptNum(const char* src);


  // Update src pointer to point to next quadgram, +2..+5
  // Looks at src[0..4]
  const char* AdvanceQuad(const char* src);
}     // end namespace getone
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class ScriptScanner {
|
|
50
|
+
public:
|
|
51
|
+
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
|
|
52
|
+
~ScriptScanner();
|
|
53
|
+
|
|
54
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
|
55
|
+
bool GetOneScriptSpan(getone::LangSpan* span);
|
|
56
|
+
|
|
57
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
|
58
|
+
void LowerScriptSpan(getone::LangSpan* span);
|
|
59
|
+
|
|
60
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
|
61
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
|
62
|
+
bool GetOneScriptSpanLower(getone::LangSpan* span);
|
|
63
|
+
|
|
64
|
+
private:
|
|
65
|
+
int SkipToFrontOfSpan(const char* src, int len, int* script);
|
|
66
|
+
|
|
67
|
+
const char* start_byte_;
|
|
68
|
+
const char* next_byte_;
|
|
69
|
+
const char* next_byte_limit_;
|
|
70
|
+
int byte_length_;
|
|
71
|
+
bool is_plain_text_;
|
|
72
|
+
char* script_buffer_; // Holds text with expanded entities
|
|
73
|
+
char* script_buffer_lower_; // Holds lowercased text
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class LangScanner {
|
|
78
|
+
public:
|
|
79
|
+
LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
|
|
80
|
+
getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
|
|
81
|
+
int maxlangs, int minlangspan);
|
|
82
|
+
~LangScanner();
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
int script() {return script_;}
|
|
86
|
+
|
|
87
|
+
// Use new text
|
|
88
|
+
// Keep smoothing state if same script, otherwise reinit smoothing
|
|
89
|
+
void NewText(getone::LangSpan* spn);
|
|
90
|
+
|
|
91
|
+
bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
|
|
92
|
+
bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
|
|
93
|
+
|
|
94
|
+
// The real ones
|
|
95
|
+
bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
|
|
96
|
+
getone::LangSpan* span);
|
|
97
|
+
bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
|
|
98
|
+
getone::LangSpan* span);
|
|
99
|
+
|
|
100
|
+
// Increases language bias by delta
|
|
101
|
+
void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
|
|
102
|
+
Language key, int delta);
|
|
103
|
+
|
|
104
|
+
// For debugging output
|
|
105
|
+
int next_answer_;
|
|
106
|
+
char answer_buffer_[getone::kMaxAnswerBuffer];
|
|
107
|
+
char answer_buffer2_[getone::kMaxAnswerBuffer];
|
|
108
|
+
char answer_buffer3_[getone::kMaxAnswerBuffer];
|
|
109
|
+
char answer_buffer4_[getone::kMaxAnswerBuffer];
|
|
110
|
+
|
|
111
|
+
private:
|
|
112
|
+
const char* start_byte_;
|
|
113
|
+
const char* next_byte_limit_;
|
|
114
|
+
const char* next_byte_;
|
|
115
|
+
const char* onelangspan_begin_;
|
|
116
|
+
int byte_length_;
|
|
117
|
+
int script_;
|
|
118
|
+
Language spanlang_;
|
|
119
|
+
int smoothwidth_;
|
|
120
|
+
int smoothwidth_2_;
|
|
121
|
+
int smoothcandidates_;
|
|
122
|
+
int maxlangs_;
|
|
123
|
+
int minlangspan_;
|
|
124
|
+
int rb_size_;
|
|
125
|
+
int next_rb_;
|
|
126
|
+
int rb_mask_;
|
|
127
|
+
uint32* rb_;
|
|
128
|
+
int* offset_rb_;
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|