cld-fixed 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/.rspec +2 -0
- data/Gemfile +6 -0
- data/LICENSE +27 -0
- data/README.md +34 -0
- data/Rakefile +5 -0
- data/cld.gemspec +22 -0
- data/ext/cld/Makefile.am +28 -0
- data/ext/cld/Makefile.in +790 -0
- data/ext/cld/aclocal.m4 +8895 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +115 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/build_aux/config.guess +1500 -0
- data/ext/cld/build_aux/config.sub +1616 -0
- data/ext/cld/build_aux/depcomp +584 -0
- data/ext/cld/build_aux/install-sh +507 -0
- data/ext/cld/build_aux/ltmain.sh +8745 -0
- data/ext/cld/build_aux/missing +367 -0
- data/ext/cld/cld_encodings.h +95 -0
- data/ext/cld/configure +17362 -0
- data/ext/cld/configure.ac +14 -0
- data/ext/cld/encodings/compact_lang_det/#cldutil.cc# +905 -0
- data/ext/cld/encodings/compact_lang_det/#cldutil.h# +1205 -0
- data/ext/cld/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
- data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
- data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
- data/ext/cld/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
- data/ext/cld/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
- data/ext/cld/encodings/compact_lang_det/#tote.cc# +299 -0
- data/ext/cld/encodings/compact_lang_det/#tote.h# +89 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +7 -0
- data/ext/cld/languages/internal/#languages.cc# +337 -0
- data/ext/cld/languages/internal/languages.cc +336 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/ext/cld/thunk.cc +55 -0
- data/lib/cld.rb +21 -0
- data/lib/cld/version.rb +3 -0
- data/spec/cld_spec.rb +67 -0
- data/spec/spec_helper.rb +6 -0
- metadata +193 -0
|
@@ -0,0 +1,570 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/getonescriptspan.h"
|
|
6
|
+
#include <stdio.h>
|
|
7
|
+
#include <string.h>
|
|
8
|
+
|
|
9
|
+
#include "encodings/lang_enc.h"
|
|
10
|
+
#include "encodings/compact_lang_det/utf8propjustletter.h"
|
|
11
|
+
#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
|
|
12
|
+
#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
|
|
13
|
+
|
|
14
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
|
15
|
+
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
|
|
16
|
+
#include "encodings/compact_lang_det/win/cld_google.h"
|
|
17
|
+
#include "encodings/compact_lang_det/win/cld_htmlutils.h"
|
|
18
|
+
#include "encodings/compact_lang_det/win/cld_unilib.h"
|
|
19
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
|
20
|
+
#include "encodings/compact_lang_det/win/cld_utf8utils.h"
|
|
21
|
+
|
|
22
|
+
// Language value 254.
// NOTE(review): the meaning of "gray" is not documented in this file.
static const Language GRAY_LANG = static_cast<Language>(254);
|
|
23
|
+
|
|
24
|
+
// Word-boundary rounding thresholds.
//
// A span shorter than this many bytes is made longer to reach a word
// boundary; any other span is made shorter.
static const int kMaxUpToWordBoundary = 50;

// Maximum number of bytes to move, in the direction chosen above, when
// rounding to a word boundary.
static const int kMaxAdvanceToWordBoundary = 10;
|
|
29
|
+
|
|
30
|
+
// Byte-indexed flag table: 1 for '&' (0x26), '<' (0x3C) and '>' (0x3E),
// 0 for every other byte value.
static const char kSpecialSymbol[256] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x00
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x10
  0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,  // 0x20  '&'
  0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,  // 0x30  '<' '>'
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x40
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x50
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x60
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x70

  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x80
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x90
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0xA0
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0xB0
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0xC0
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0xD0
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0xE0
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0xF0
};
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
#define LT 0 // <
|
|
45
|
+
#define GT 1 // >
|
|
46
|
+
#define EX 2 // !
|
|
47
|
+
#define HY 3 // -
|
|
48
|
+
#define QU 4 // "
|
|
49
|
+
#define AP 5 // '
|
|
50
|
+
#define SL 6 // /
|
|
51
|
+
#define S_ 7
|
|
52
|
+
#define C_ 8
|
|
53
|
+
#define R_ 9
|
|
54
|
+
#define I_ 10
|
|
55
|
+
#define P_ 11
|
|
56
|
+
#define T_ 12
|
|
57
|
+
#define Y_ 13
|
|
58
|
+
#define L_ 14
|
|
59
|
+
#define E_ 15
|
|
60
|
+
#define CR 16 // <cr> or <lf>
|
|
61
|
+
#define NL 17 // non-letter: ASCII whitespace, digit, punctuation
|
|
62
|
+
#define PL 18 // possible letter, incl. &
|
|
63
|
+
#define xx 19 // <unused>
|
|
64
|
+
|
|
65
|
+
// Map byte to one of ~20 interesting categories for cheap tag parsing
|
|
66
|
+
static const uint8 kCharToSub[256] = {
|
|
67
|
+
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
|
|
68
|
+
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
|
|
69
|
+
NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
|
|
70
|
+
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
|
|
71
|
+
|
|
72
|
+
PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
|
|
73
|
+
P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
|
|
74
|
+
PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
|
|
75
|
+
P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
|
|
76
|
+
|
|
77
|
+
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
|
|
78
|
+
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
|
|
79
|
+
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
|
|
80
|
+
NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
|
|
81
|
+
|
|
82
|
+
PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
|
|
83
|
+
PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
|
|
84
|
+
PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
|
|
85
|
+
PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
|
|
86
|
+
};
|
|
87
|
+
|
|
88
|
+
#undef LT
|
|
89
|
+
#undef GT
|
|
90
|
+
#undef EX
|
|
91
|
+
#undef HY
|
|
92
|
+
#undef QU
|
|
93
|
+
#undef AP
|
|
94
|
+
#undef SL
|
|
95
|
+
#undef S_
|
|
96
|
+
#undef C_
|
|
97
|
+
#undef R_
|
|
98
|
+
#undef I_
|
|
99
|
+
#undef P_
|
|
100
|
+
#undef T_
|
|
101
|
+
#undef Y_
|
|
102
|
+
#undef L_
|
|
103
|
+
#undef E_
|
|
104
|
+
#undef CR
|
|
105
|
+
#undef NL
|
|
106
|
+
#undef PL
|
|
107
|
+
#undef xx
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
#define OK 0
|
|
111
|
+
#define X_ 1
|
|
112
|
+
|
|
113
|
+
// State machine to do cheap parse of non-letter strings incl. tags
|
|
114
|
+
// advances <tag>
|
|
115
|
+
// | |
|
|
116
|
+
// advances <tag> ... </tag> for <script> <style>
|
|
117
|
+
// | |
|
|
118
|
+
// advances <!-- ... <tag> ... -->
|
|
119
|
+
// | |
|
|
120
|
+
// advances <tag
|
|
121
|
+
// || (0)
|
|
122
|
+
// advances <tag <tag2>
|
|
123
|
+
// || (0)
|
|
124
|
+
static const uint8 kTagParseTbl_0[] = {
|
|
125
|
+
// < > ! - " ' / S C R I P T Y L E CR NL PL xx
|
|
126
|
+
3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK
|
|
127
|
+
X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
|
|
128
|
+
3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL*
|
|
129
|
+
X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] <
|
|
130
|
+
X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <!
|
|
131
|
+
X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!-
|
|
132
|
+
6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.*
|
|
133
|
+
6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*-
|
|
134
|
+
6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*--
|
|
135
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.*
|
|
136
|
+
10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
|
|
137
|
+
11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
|
|
138
|
+
X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
|
|
139
|
+
|
|
140
|
+
// < > ! - " ' / S C R I P T Y L E CR NL PL xx
|
|
141
|
+
X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S
|
|
142
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC
|
|
143
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR
|
|
144
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI
|
|
145
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP
|
|
146
|
+
X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
|
|
147
|
+
20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
|
|
148
|
+
19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
|
|
149
|
+
19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
|
|
150
|
+
19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
|
|
151
|
+
19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
|
|
152
|
+
19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
|
|
153
|
+
19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
|
|
154
|
+
19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
|
|
155
|
+
19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
|
|
156
|
+
|
|
157
|
+
// < > ! - " ' / S C R I P T Y L E CR NL PL xx
|
|
158
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST
|
|
159
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY
|
|
160
|
+
X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL
|
|
161
|
+
X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
|
|
162
|
+
33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
|
|
163
|
+
32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
|
|
164
|
+
32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
|
|
165
|
+
32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
|
|
166
|
+
32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
|
|
167
|
+
32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
|
|
168
|
+
32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
|
|
169
|
+
32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
|
|
170
|
+
};
|
|
171
|
+
|
|
172
|
+
#undef OK
|
|
173
|
+
#undef X_
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
/*
|
|
177
|
+
// Convert GetTimeOfDay output to 64-bit usec
|
|
178
|
+
static inline uint64 Microseconds(const struct timeval& t) {
|
|
179
|
+
// The SumReducer uses uint64, so convert to (uint64) microseconds,
|
|
180
|
+
// not (double) seconds.
|
|
181
|
+
return t.tv_sec * 1000000ULL + t.tv_usec;
|
|
182
|
+
}
|
|
183
|
+
*/
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
// Returns true if character is < > or &
|
|
187
|
+
bool inline IsSpecial(char c) {
|
|
188
|
+
if ((c & 0xe0) == 0x20) {
|
|
189
|
+
return kSpecialSymbol[static_cast<uint8>(c)];
|
|
190
|
+
}
|
|
191
|
+
return false;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
// Quick Skip to next letter or < > & or to end of string (eos)
|
|
195
|
+
// Always return is_letter for eos
|
|
196
|
+
int ScanToLetterOrSpecial(const char* src, int len) {
|
|
197
|
+
int bytes_consumed;
|
|
198
|
+
cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len,
|
|
199
|
+
&bytes_consumed);
|
|
200
|
+
return bytes_consumed;
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
// src points to non-letter, such as tag-opening '<'
|
|
206
|
+
// Return length from here to next possible letter
|
|
207
|
+
// On eos or another < before >, return 1
|
|
208
|
+
// advances <tag>
|
|
209
|
+
// | |
|
|
210
|
+
// advances <tag> ... </tag> for <script> <style>
|
|
211
|
+
// | |
|
|
212
|
+
// advances <!-- ... <tag> ... -->
|
|
213
|
+
// | |
|
|
214
|
+
// advances <tag
|
|
215
|
+
// || (1)
|
|
216
|
+
// advances <tag <tag2>
|
|
217
|
+
// || (1)
|
|
218
|
+
int ScanToPossibleLetter(const char* isrc, int len) {
|
|
219
|
+
const uint8* src = reinterpret_cast<const uint8*>(isrc);
|
|
220
|
+
const uint8* srclimit = src + len;
|
|
221
|
+
const uint8* tagParseTbl = kTagParseTbl_0;
|
|
222
|
+
int e = 0;
|
|
223
|
+
while (src < srclimit) {
|
|
224
|
+
e = tagParseTbl[kCharToSub[*src++]];
|
|
225
|
+
if ((e & ~1) == 0) {
|
|
226
|
+
// We overshot by one byte
|
|
227
|
+
--src;
|
|
228
|
+
break;
|
|
229
|
+
}
|
|
230
|
+
tagParseTbl = &kTagParseTbl_0[e * 20];
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
if (src >= srclimit) {
|
|
234
|
+
// We fell off the end of the text.
|
|
235
|
+
// It looks like the most common case for this is a truncated file, not
|
|
236
|
+
// mismatched angle brackets. So we pretend that the last char was '>'
|
|
237
|
+
return len;
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
// OK to be in state 0 or state 2 at exit
|
|
241
|
+
if ((e != 0) && (e != 2)) {
|
|
242
|
+
// Error, '<' followed by '<'
|
|
243
|
+
// We want to back up to first <, then advance by one byte past it
|
|
244
|
+
int offset = src - reinterpret_cast<const uint8*>(isrc);
|
|
245
|
+
// printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
|
|
246
|
+
|
|
247
|
+
// Backscan to first '<' and return enough length to just get past it
|
|
248
|
+
--offset; // back up over the second '<', which caused us to stop
|
|
249
|
+
while ((0 < offset) && (isrc[offset] != '<')) {
|
|
250
|
+
// Find the first '<', which is unmatched
|
|
251
|
+
--offset;
|
|
252
|
+
}
|
|
253
|
+
// skip to just beyond first '<'
|
|
254
|
+
// printf(" returning %d\n", offset + 1);
|
|
255
|
+
return offset + 1;
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
return src - reinterpret_cast<const uint8*>(isrc);
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
// Prepares a scanner over buffer[0 .. buffer_length) and allocates the two
// scratch buffers used for span text. When is_plain_text is true, the
// scanning routines do not give '<', '>' and '&' any special treatment.
ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text)
    : start_byte_(buffer),
      next_byte_(buffer),
      next_byte_limit_(buffer + buffer_length),
      byte_length_(buffer_length),
      is_plain_text_(is_plain_text) {
  // Both buffers are released in the destructor.
  script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
  script_buffer_ = new char[getone::kMaxScriptBuffer];
}
|
|
274
|
+
|
|
275
|
+
// Frees the two scratch buffers allocated by the constructor.
ScriptScanner::~ScriptScanner() {
  delete[] script_buffer_lower_;
  delete[] script_buffer_;
}
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
// Get to the first real non-tag letter or entity that is a letter
|
|
284
|
+
// Sets script of that letter
|
|
285
|
+
// Return len if no more letters
|
|
286
|
+
int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
|
|
287
|
+
int sc = UNKNOWN_LSCRIPT;
|
|
288
|
+
int skip = 0;
|
|
289
|
+
int tlen, plen;
|
|
290
|
+
|
|
291
|
+
// Do run of non-letters (tag | &NL | NL)*
|
|
292
|
+
while (skip < len) {
|
|
293
|
+
// Do fast scan to next interesting byte
|
|
294
|
+
// int oldskip = skip;
|
|
295
|
+
skip += ScanToLetterOrSpecial(src + skip, len - skip);
|
|
296
|
+
// TEMP
|
|
297
|
+
// printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
|
|
298
|
+
// oldskip, src[oldskip], skip, src[skip]);
|
|
299
|
+
|
|
300
|
+
// Check for no more letters/specials
|
|
301
|
+
if (skip >= len) {
|
|
302
|
+
// All done
|
|
303
|
+
return len;
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
// We are at a letter, nonletter, tag, or entity
|
|
307
|
+
if (IsSpecial(src[skip]) && !is_plain_text_) {
|
|
308
|
+
if (src[skip] == '<') {
|
|
309
|
+
// Begining of tag; skip to end and go around again
|
|
310
|
+
tlen = ScanToPossibleLetter(src + skip, len - skip);
|
|
311
|
+
sc = 0;
|
|
312
|
+
// printf("<...> ");
|
|
313
|
+
} else if (src[skip] == '>') {
|
|
314
|
+
// Unexpected end of tag; skip it and go around again
|
|
315
|
+
tlen = 1; // Over the >
|
|
316
|
+
sc = 0;
|
|
317
|
+
// printf("..> ");
|
|
318
|
+
} else if (src[skip] == '&') {
|
|
319
|
+
// Expand entity, no advance
|
|
320
|
+
char temp[4];
|
|
321
|
+
EntityToBuffer(src + skip, len - skip,
|
|
322
|
+
temp, &tlen, &plen);
|
|
323
|
+
sc = getone::GetUTF8LetterScriptNum(temp);
|
|
324
|
+
// printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
|
|
325
|
+
}
|
|
326
|
+
} else {
|
|
327
|
+
// Update 1..4 bytes
|
|
328
|
+
tlen = cld_UniLib::OneCharLen(src + skip);
|
|
329
|
+
sc = getone::GetUTF8LetterScriptNum(src + skip);
|
|
330
|
+
// printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
|
|
331
|
+
}
|
|
332
|
+
// TEMP
|
|
333
|
+
// printf("sc=%d ", sc);
|
|
334
|
+
if (sc != 0) {break;} // Letter found
|
|
335
|
+
skip += tlen; // Advance
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
*script = sc;
|
|
339
|
+
return skip;
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
|
345
|
+
// Buffer has leading space and all text is lowercased
|
|
346
|
+
bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
|
|
347
|
+
span->text = script_buffer_;
|
|
348
|
+
span->text_bytes = 0;
|
|
349
|
+
span->offset = next_byte_ - start_byte_;
|
|
350
|
+
span->script = UNKNOWN_LSCRIPT;
|
|
351
|
+
span->lang = UNKNOWN_LANGUAGE;
|
|
352
|
+
span->truncated = false;
|
|
353
|
+
|
|
354
|
+
// printf("GetOneScriptSpan[[ ");
|
|
355
|
+
// struct timeval script_start, script_mid, script_end;
|
|
356
|
+
|
|
357
|
+
int spanscript; // The script of this span
|
|
358
|
+
int sc = UNKNOWN_LSCRIPT; // The script of next character
|
|
359
|
+
int tlen, plen;
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
script_buffer_[0] = ' '; // Always a space at front of output
|
|
363
|
+
script_buffer_[1] = '\0';
|
|
364
|
+
int take = 0;
|
|
365
|
+
int put = 1; // Start after the initial space
|
|
366
|
+
|
|
367
|
+
// gettimeofday(&script_start, NULL);
|
|
368
|
+
// Get to the first real non-tag letter or entity that is a letter
|
|
369
|
+
int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
|
|
370
|
+
next_byte_ += skip;
|
|
371
|
+
byte_length_ -= skip;
|
|
372
|
+
if (byte_length_ <= 0) {
|
|
373
|
+
// printf("]]\n");
|
|
374
|
+
return false; // No more letters to be found
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
// gettimeofday(&script_mid, NULL);
|
|
378
|
+
|
|
379
|
+
// There is at least one letter, so we know the script for this span
|
|
380
|
+
// printf("{%d} ", spanscript);
|
|
381
|
+
span->script = (UnicodeLScript)spanscript;
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
// Go over alternating spans of same-script letters and non-letters,
|
|
385
|
+
// copying letters to buffer with single spaces for each run of non-letters
|
|
386
|
+
while (take < byte_length_) {
|
|
387
|
+
// Copy run of letters in same script (&LS | LS)*
|
|
388
|
+
int letter_count = 0; // Keep track of word length
|
|
389
|
+
bool need_break = false;
|
|
390
|
+
while (take < byte_length_) {
|
|
391
|
+
// We are at a letter, nonletter, tag, or entity
|
|
392
|
+
if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
|
|
393
|
+
// printf("\"%c\" ", next_byte_[take]);
|
|
394
|
+
if (next_byte_[take] == '<') {
|
|
395
|
+
// Begining of tag
|
|
396
|
+
sc = 0;
|
|
397
|
+
break;
|
|
398
|
+
} else if (next_byte_[take] == '>') {
|
|
399
|
+
// Unexpected end of tag
|
|
400
|
+
sc = 0;
|
|
401
|
+
break;
|
|
402
|
+
} else if (next_byte_[take] == '&') {
|
|
403
|
+
// Copy entity, no advance
|
|
404
|
+
EntityToBuffer(next_byte_ + take, byte_length_ - take,
|
|
405
|
+
script_buffer_ + put, &tlen, &plen);
|
|
406
|
+
sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
|
|
407
|
+
}
|
|
408
|
+
} else {
|
|
409
|
+
// Real letter, safely copy up to 4 bytes, increment by 1..4
|
|
410
|
+
// Will update by 1..4 bytes at Advance, below
|
|
411
|
+
tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
|
|
412
|
+
if (take < (byte_length_ - 3)) {
|
|
413
|
+
// Fast case
|
|
414
|
+
*reinterpret_cast<uint32*>(script_buffer_ + put) =
|
|
415
|
+
*reinterpret_cast<const uint32*>(next_byte_ + take);
|
|
416
|
+
} else {
|
|
417
|
+
// Slow case, happens 1-3 times per input document
|
|
418
|
+
memcpy(script_buffer_ + put, next_byte_ + take, plen);
|
|
419
|
+
}
|
|
420
|
+
sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
|
|
421
|
+
}
|
|
422
|
+
// printf("sc(%c)=%d ", next_byte_[take], sc);
|
|
423
|
+
// char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);
|
|
424
|
+
// xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);
|
|
425
|
+
|
|
426
|
+
// Allow continue across a single letter in a different script:
|
|
427
|
+
// A B D = three scripts, c = common script, i = inherited script,
|
|
428
|
+
// - = don't care, ( = take position before the += below
|
|
429
|
+
// AAA(A- continue
|
|
430
|
+
//
|
|
431
|
+
// AAA(BA continue
|
|
432
|
+
// AAA(BB break
|
|
433
|
+
// AAA(Bc continue (breaks after B)
|
|
434
|
+
// AAA(BD break
|
|
435
|
+
// AAA(Bi break
|
|
436
|
+
//
|
|
437
|
+
// AAA(c- break
|
|
438
|
+
//
|
|
439
|
+
// AAA(i- continue
|
|
440
|
+
//
|
|
441
|
+
|
|
442
|
+
if ((sc != spanscript) && (sc != ULScript_Inherited)) {
|
|
443
|
+
// Might need to break this script span
|
|
444
|
+
if (sc == ULScript_Common) {
|
|
445
|
+
need_break = true;
|
|
446
|
+
} else {
|
|
447
|
+
// Look at next following character, ignoring entity as Common
|
|
448
|
+
int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
|
|
449
|
+
if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
|
|
450
|
+
need_break = true;
|
|
451
|
+
}
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
if (need_break) {break;} // Non-letter or letter in wrong script
|
|
455
|
+
|
|
456
|
+
take += tlen; // Advance
|
|
457
|
+
put += plen; // Advance
|
|
458
|
+
++letter_count;
|
|
459
|
+
if (put >= getone::kMaxScriptBytes) {
|
|
460
|
+
// Buffer is full
|
|
461
|
+
span->truncated = true;
|
|
462
|
+
break;
|
|
463
|
+
}
|
|
464
|
+
} // End while letters
|
|
465
|
+
|
|
466
|
+
// Do run of non-letters (tag | &NL | NL)*
|
|
467
|
+
while (take < byte_length_) {
|
|
468
|
+
// Do fast scan to next interesting byte
|
|
469
|
+
take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
|
|
470
|
+
|
|
471
|
+
// Check for no more letters/specials
|
|
472
|
+
if (take >= byte_length_) {
|
|
473
|
+
take = byte_length_;
|
|
474
|
+
break;
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
// We are at a letter, nonletter, tag, or entity
|
|
478
|
+
if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
|
|
479
|
+
// printf("\"%c\" ", next_byte_[take]);
|
|
480
|
+
if (next_byte_[take] == '<') {
|
|
481
|
+
// Begining of tag; skip to end and go around again
|
|
482
|
+
tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
|
|
483
|
+
sc = 0;
|
|
484
|
+
// printf("<...> ");
|
|
485
|
+
} else if (next_byte_[take] == '>') {
|
|
486
|
+
// Unexpected end of tag; skip it and go around again
|
|
487
|
+
tlen = 1; // Over the >
|
|
488
|
+
sc = 0;
|
|
489
|
+
// printf("..> ");
|
|
490
|
+
} else if (next_byte_[take] == '&') {
|
|
491
|
+
// Expand entity, no advance
|
|
492
|
+
EntityToBuffer(next_byte_ + take, byte_length_ - take,
|
|
493
|
+
script_buffer_ + put, &tlen, &plen);
|
|
494
|
+
sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
|
|
495
|
+
}
|
|
496
|
+
} else {
|
|
497
|
+
// Update 1..4
|
|
498
|
+
tlen = cld_UniLib::OneCharLen(next_byte_ + take);
|
|
499
|
+
sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
|
|
500
|
+
}
|
|
501
|
+
// printf("sc[%c]=%d ", next_byte_[take], sc);
|
|
502
|
+
if (sc != 0) {break;} // Letter found
|
|
503
|
+
take += tlen; // Advance
|
|
504
|
+
} // End while not-letters
|
|
505
|
+
|
|
506
|
+
script_buffer_[put++] = ' ';
|
|
507
|
+
|
|
508
|
+
// We are at a letter again (or eos), after letter* not-letter*
|
|
509
|
+
if (sc != spanscript) {break;} // Letter in wrong script
|
|
510
|
+
if (put >= getone::kMaxScriptBytes - 8) {
|
|
511
|
+
// Buffer is almost full
|
|
512
|
+
span->truncated = true;
|
|
513
|
+
break;
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
// Update input position
|
|
518
|
+
next_byte_ += take;
|
|
519
|
+
byte_length_ -= take;
|
|
520
|
+
|
|
521
|
+
// Put four more spaces/NUL. Worst case is abcd _ _ _ \0
|
|
522
|
+
// kMaxScriptBytes | | put
|
|
523
|
+
script_buffer_[put + 0] = ' ';
|
|
524
|
+
script_buffer_[put + 1] = ' ';
|
|
525
|
+
script_buffer_[put + 2] = ' ';
|
|
526
|
+
script_buffer_[put + 3] = '\0';
|
|
527
|
+
|
|
528
|
+
span->text_bytes = put; // Does not include the last four chars above
|
|
529
|
+
|
|
530
|
+
// printf(" %d]]\n\n", put);
|
|
531
|
+
return true;
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
// Force Latin, Cyrillic, Greek scripts to be lowercase
|
|
535
|
+
void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
|
|
536
|
+
// On Windows, text is lowercased beforehand, so no need to do anything here.
|
|
537
|
+
#if !defined(CLD_WINDOWS)
|
|
538
|
+
// If needed, lowercase all the text. If we do it sooner, might miss
|
|
539
|
+
// lowercasing an entity such as Á
|
|
540
|
+
// We only need to do this for Latn and Cyrl scripts
|
|
541
|
+
if ((span->script == ULScript_Latin) ||
|
|
542
|
+
(span->script == ULScript_Cyrillic) ||
|
|
543
|
+
(span->script == ULScript_Greek)) {
|
|
544
|
+
// Full Unicode lowercase of the entire buffer, including
|
|
545
|
+
// four pad bytes off the end
|
|
546
|
+
int consumed, filled;
|
|
547
|
+
UniLib::ToLower(span->text, span->text_bytes + 4,
|
|
548
|
+
script_buffer_lower_, getone::kMaxScriptLowerBuffer,
|
|
549
|
+
&consumed, &filled);
|
|
550
|
+
span->text = script_buffer_lower_;
|
|
551
|
+
span->text_bytes = filled - 4;
|
|
552
|
+
}
|
|
553
|
+
#endif
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
|
557
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
|
558
|
+
bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
|
|
559
|
+
bool ok = GetOneScriptSpan(span);
|
|
560
|
+
LowerScriptSpan(span);
|
|
561
|
+
return ok;
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
// Gets lscript number for letters; always returns
|
|
565
|
+
// 0 (common script) for non-letters
|
|
566
|
+
int getone::GetUTF8LetterScriptNum(const char* src) {
|
|
567
|
+
int srclen = cld_UniLib::OneCharLen(src);
|
|
568
|
+
const uint8* usrc = reinterpret_cast<const uint8*>(src);
|
|
569
|
+
return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
|
|
570
|
+
}
|