cld-fixed 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/.rspec +2 -0
- data/Gemfile +6 -0
- data/LICENSE +27 -0
- data/README.md +34 -0
- data/Rakefile +5 -0
- data/cld.gemspec +22 -0
- data/ext/cld/Makefile.am +28 -0
- data/ext/cld/Makefile.in +790 -0
- data/ext/cld/aclocal.m4 +8895 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +115 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/build_aux/config.guess +1500 -0
- data/ext/cld/build_aux/config.sub +1616 -0
- data/ext/cld/build_aux/depcomp +584 -0
- data/ext/cld/build_aux/install-sh +507 -0
- data/ext/cld/build_aux/ltmain.sh +8745 -0
- data/ext/cld/build_aux/missing +367 -0
- data/ext/cld/cld_encodings.h +95 -0
- data/ext/cld/configure +17362 -0
- data/ext/cld/configure.ac +14 -0
- data/ext/cld/encodings/compact_lang_det/#cldutil.cc# +905 -0
- data/ext/cld/encodings/compact_lang_det/#cldutil.h# +1205 -0
- data/ext/cld/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
- data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
- data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
- data/ext/cld/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
- data/ext/cld/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
- data/ext/cld/encodings/compact_lang_det/#tote.cc# +299 -0
- data/ext/cld/encodings/compact_lang_det/#tote.h# +89 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +7 -0
- data/ext/cld/languages/internal/#languages.cc# +337 -0
- data/ext/cld/languages/internal/languages.cc +336 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/ext/cld/thunk.cc +55 -0
- data/lib/cld.rb +21 -0
- data/lib/cld/version.rb +3 -0
- data/spec/cld_spec.rb +67 -0
- data/spec/spec_helper.rb +6 -0
- metadata +193 -0
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
//
|
|
5
|
+
// This file extends lang_enc.h with additional languages and extended routines.
|
|
6
|
+
// It is current with Unicode 5.1 (March 2008)
|
|
7
|
+
//
|
|
8
|
+
|
|
9
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__
|
|
10
|
+
#define ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__
|
|
11
|
+
|
|
12
|
+
#include "languages/public/languages.h"
|
|
13
|
+
#include "encodings/compact_lang_det/letterscript_enum.h"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
// Leave a small gap after the base languages, so adding one or two is easy.
|
|
17
|
+
// Just reduce the gap here (currently 4 entries)
|
|
18
|
+
|
|
19
|
+
// Montenegrin added, so reducing this from 5 to 4. dsites 2008.10.06
|
|
20
|
+
#define EXT_LANGUAGE_BASE (NUM_LANGUAGES + 4)

// Every extended language is EXT_LANGUAGE_BASE plus a fixed offset, cast to
// Language. Each replacement list is wrapped in an outer pair of parentheses
// so the macro expands to a single primary expression; without them a use
// such as `sizeof X_KLINGON` does not parse.

// Google UI languages
#define X_BORK_BORK_BORK ((Language)(EXT_LANGUAGE_BASE+0))
#define X_PIG_LATIN ((Language)(EXT_LANGUAGE_BASE+1))
#define X_HACKER ((Language)(EXT_LANGUAGE_BASE+2))
#define X_KLINGON ((Language)(EXT_LANGUAGE_BASE+3))
#define X_ELMER_FUDD ((Language)(EXT_LANGUAGE_BASE+4))

// Pseudo-languages for Unicode scripts that express a single language
#define X_OGHAM ((Language)(EXT_LANGUAGE_BASE+5))
#define X_RUNIC ((Language)(EXT_LANGUAGE_BASE+6))
#define X_YI ((Language)(EXT_LANGUAGE_BASE+7))
#define X_OLD_ITALIC ((Language)(EXT_LANGUAGE_BASE+8))
#define X_GOTHIC ((Language)(EXT_LANGUAGE_BASE+9))
#define X_DESERET ((Language)(EXT_LANGUAGE_BASE+10))
#define X_HANUNOO ((Language)(EXT_LANGUAGE_BASE+11))
#define X_BUHID ((Language)(EXT_LANGUAGE_BASE+12))
#define X_TAGBANWA ((Language)(EXT_LANGUAGE_BASE+13))
#define X_TAI_LE ((Language)(EXT_LANGUAGE_BASE+14))
#define X_LINEAR_B ((Language)(EXT_LANGUAGE_BASE+15))
#define X_UGARITIC ((Language)(EXT_LANGUAGE_BASE+16))
#define X_SHAVIAN ((Language)(EXT_LANGUAGE_BASE+17))
#define X_OSMANYA ((Language)(EXT_LANGUAGE_BASE+18))
#define X_CYPRIOT ((Language)(EXT_LANGUAGE_BASE+19))
#define X_BUGINESE ((Language)(EXT_LANGUAGE_BASE+20))
#define X_COPTIC ((Language)(EXT_LANGUAGE_BASE+21))
#define X_NEW_TAI_LUE ((Language)(EXT_LANGUAGE_BASE+22))
#define X_GLAGOLITIC ((Language)(EXT_LANGUAGE_BASE+23))
#define X_TIFINAGH ((Language)(EXT_LANGUAGE_BASE+24))
#define X_SYLOTI_NAGRI ((Language)(EXT_LANGUAGE_BASE+25))
#define X_OLD_PERSIAN ((Language)(EXT_LANGUAGE_BASE+26))
#define X_KHAROSHTHI ((Language)(EXT_LANGUAGE_BASE+27))
#define X_BALINESE ((Language)(EXT_LANGUAGE_BASE+28))
#define X_CUNEIFORM ((Language)(EXT_LANGUAGE_BASE+29))
#define X_PHOENICIAN ((Language)(EXT_LANGUAGE_BASE+30))
#define X_PHAGS_PA ((Language)(EXT_LANGUAGE_BASE+31))
#define X_NKO ((Language)(EXT_LANGUAGE_BASE+32))

// Unicode 5.1
// NOTE(review): X_SUDANESE presumably refers to the Sundanese script; the
// identifier spelling is kept as-is because callers depend on it.
#define X_SUDANESE ((Language)(EXT_LANGUAGE_BASE+33))
#define X_LEPCHA ((Language)(EXT_LANGUAGE_BASE+34))
#define X_OL_CHIKI ((Language)(EXT_LANGUAGE_BASE+35))
#define X_VAI ((Language)(EXT_LANGUAGE_BASE+36))
#define X_SAURASHTRA ((Language)(EXT_LANGUAGE_BASE+37))
#define X_KAYAH_LI ((Language)(EXT_LANGUAGE_BASE+38))
#define X_REJANG ((Language)(EXT_LANGUAGE_BASE+39))
#define X_LYCIAN ((Language)(EXT_LANGUAGE_BASE+40))
#define X_CARIAN ((Language)(EXT_LANGUAGE_BASE+41))
#define X_LYDIAN ((Language)(EXT_LANGUAGE_BASE+42))
#define X_CHAM ((Language)(EXT_LANGUAGE_BASE+43))

// One past the last extended language defined above.
#define EXT_NUM_LANGUAGES ((Language)(EXT_LANGUAGE_BASE+44))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
// ExtLanguageName
|
|
77
|
+
// ------------
|
|
78
|
+
// Given the Language, returns its string name used as the output by
|
|
79
|
+
// the lang/enc identifier, e.g. "Korean"
|
|
80
|
+
// "invalid_language" if the input is invalid.
|
|
81
|
+
extern const char* ExtLanguageName(const Language lang);
|
|
82
|
+
|
|
83
|
+
// ExtLanguageDeclaredName
|
|
84
|
+
// ------------
|
|
85
|
+
// Given the Language, returns its Language enum spelling, for use by
|
|
86
|
+
// programs that create C declarations, e.g. "KOREAN"
|
|
87
|
+
// "UNKNOWN_LANGUAGE" if the input is invalid.
|
|
88
|
+
extern const char* ExtLanguageDeclaredName(const Language lang);
|
|
89
|
+
|
|
90
|
+
// ExtLanguageCode
|
|
91
|
+
// ------------
|
|
92
|
+
// Given the Language, return the language code, e.g. "ko"
|
|
93
|
+
// This is determined by
|
|
94
|
+
// the following (in order of preference):
|
|
95
|
+
// - ISO-639-1 two-letter language code
|
|
96
|
+
// (all except those mentioned below)
|
|
97
|
+
// - ISO-639-2 three-letter bibliographic language code
|
|
98
|
+
// (Tibetan, Dhivehi, Cherokee, Syriac)
|
|
99
|
+
// - Google-specific language code
|
|
100
|
+
// (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
|
|
101
|
+
// Portuguese-Portugal, Portuguese-Brazil, Limbu)
|
|
102
|
+
extern const char * ExtLanguageCode(const Language lang);
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
// Convert "en-Latn-GB" to ENGLISH
|
|
106
|
+
// Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
|
|
107
|
+
// Consider for later: NORWEGIAN, NORWEGIAN_N
|
|
108
|
+
// Consider for later: SCOTS, SCOTS_GAELIC
|
|
109
|
+
// Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
|
|
110
|
+
//
|
|
111
|
+
Language GetLanguageFromNumberOrName(const char* src);
|
|
112
|
+
|
|
113
|
+
// Convert "en-Latn-GB" to ULScript_Latin
|
|
114
|
+
UnicodeLScript GetLScriptFromNumberOrName(const char* src);
|
|
115
|
+
|
|
116
|
+
// Merge together some languages, such as bs/hr/sr
|
|
117
|
+
Language NormalizeLanguage(Language lang);
|
|
118
|
+
|
|
119
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_EXT_LANG_ENC_H__
|
|
@@ -0,0 +1,570 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/getonescriptspan.h"
|
|
6
|
+
#include <stdio.h>
|
|
7
|
+
#include <string.h>
|
|
8
|
+
|
|
9
|
+
#include "encodings/lang_enc.h"
|
|
10
|
+
#include "encodings/compact_lang_det/utf8propjustletter.h"
|
|
11
|
+
#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
|
|
12
|
+
#include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
|
|
13
|
+
|
|
14
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
|
15
|
+
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
|
|
16
|
+
#include "encodings/compact_lang_det/win/cld_google.h"
|
|
17
|
+
#include "encodings/compact_lang_det/win/cld_htmlutils.h"
|
|
18
|
+
#include "encodings/compact_lang_det/win/cld_unilib.h"
|
|
19
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
|
20
|
+
#include "encodings/compact_lang_det/win/cld_utf8utils.h"
|
|
21
|
+
|
|
22
|
+
static const Language GRAY_LANG = (Language)254;
|
|
23
|
+
|
|
24
|
+
static const int kMaxUpToWordBoundary = 50; // span < this make longer,
|
|
25
|
+
// else make shorter
|
|
26
|
+
static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes
|
|
27
|
+
// to round to word boundary,
|
|
28
|
+
// direction above
|
|
29
|
+
|
|
30
|
+
// Byte-indexed flag table: nonzero only for the three bytes '&' (0x26),
// '<' (0x3c) and '>' (0x3e). Each row covers 16 byte values; the trailing
// comment gives the value of the first entry in the row.
static const char kSpecialSymbol[256] = {
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x00
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x10
  0,0,0,0, 0,0,1,0, 0,0,0,0, 0,0,0,0,   // 0x20  '&'
  0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,1,0,   // 0x30  '<' '>'
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x40
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x50
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x60
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x70

  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x80
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0x90
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xa0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xb0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xc0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xd0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xe0
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,   // 0xf0
};
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
#define LT 0 // <
|
|
45
|
+
#define GT 1 // >
|
|
46
|
+
#define EX 2 // !
|
|
47
|
+
#define HY 3 // -
|
|
48
|
+
#define QU 4 // "
|
|
49
|
+
#define AP 5 // '
|
|
50
|
+
#define SL 6 // /
|
|
51
|
+
#define S_ 7
|
|
52
|
+
#define C_ 8
|
|
53
|
+
#define R_ 9
|
|
54
|
+
#define I_ 10
|
|
55
|
+
#define P_ 11
|
|
56
|
+
#define T_ 12
|
|
57
|
+
#define Y_ 13
|
|
58
|
+
#define L_ 14
|
|
59
|
+
#define E_ 15
|
|
60
|
+
#define CR 16 // <cr> or <lf>
|
|
61
|
+
#define NL 17 // non-letter: ASCII whitespace, digit, punctuation
|
|
62
|
+
#define PL 18 // possible letter, incl. &
|
|
63
|
+
#define xx 19 // <unused>
|
|
64
|
+
|
|
65
|
+
// Map byte to one of ~20 interesting categories for cheap tag parsing
|
|
66
|
+
// Byte -> category code for the cheap tag parser. The category names are the
// short macros defined just above this table; each row covers 16 byte values.
static const uint8 kCharToSub[256] = {
  // 0x00-0x1f: all non-letter, except LF (0x0a) and CR (0x0d) which map to CR
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  // 0x20-0x2f: space and punctuation; ! " ' - / have their own codes, & is PL
  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
  // 0x30-0x3f: digits and punctuation; < and > have their own codes
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,

  // 0x40-0x5f: '@' and A-Z; the letters of SCRIPT / STYLE have their own codes
  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
  // 0x60-0x7f: '`' and a-z, classified exactly like the upper-case rows
  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,

  // 0x80-0xbf: non-letter
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,

  // 0xc0-0xff: possible letter
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
};
|
|
87
|
+
|
|
88
|
+
#undef LT
|
|
89
|
+
#undef GT
|
|
90
|
+
#undef EX
|
|
91
|
+
#undef HY
|
|
92
|
+
#undef QU
|
|
93
|
+
#undef AP
|
|
94
|
+
#undef SL
|
|
95
|
+
#undef S_
|
|
96
|
+
#undef C_
|
|
97
|
+
#undef R_
|
|
98
|
+
#undef I_
|
|
99
|
+
#undef P_
|
|
100
|
+
#undef T_
|
|
101
|
+
#undef Y_
|
|
102
|
+
#undef L_
|
|
103
|
+
#undef E_
|
|
104
|
+
#undef CR
|
|
105
|
+
#undef NL
|
|
106
|
+
#undef PL
|
|
107
|
+
#undef xx
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
#define OK 0
|
|
111
|
+
#define X_ 1
|
|
112
|
+
|
|
113
|
+
// State machine to do cheap parse of non-letter strings incl. tags
|
|
114
|
+
// advances <tag>
|
|
115
|
+
// | |
|
|
116
|
+
// advances <tag> ... </tag> for <script> <style>
|
|
117
|
+
// | |
|
|
118
|
+
// advances <!-- ... <tag> ... -->
|
|
119
|
+
// | |
|
|
120
|
+
// advances <tag
|
|
121
|
+
// || (0)
|
|
122
|
+
// advances <tag <tag2>
|
|
123
|
+
// || (0)
|
|
124
|
+
// Flattened transition table for the tag skipper. Row r holds the 20 entries
// for state r, one per byte category in the column order shown in the header
// comments (the category codes produced by kCharToSub). An entry of OK (0) or
// X_ (1) stops the scan in ScanToPossibleLetter; any other entry is the next
// state. The trailing comment on each row shows the input prefix that leads to
// that state.
static const uint8 kTagParseTbl_0[] = {
  // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK
  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
  3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL*
  X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] <
  X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <!
  X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!-
  6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.*
  6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*-
  6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*--
  X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.*
  10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '

  // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S
  X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC
  X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR
  X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI
  X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP
  X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT

  // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST
  X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY
  X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL
  X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
};
|
|
171
|
+
|
|
172
|
+
#undef OK
|
|
173
|
+
#undef X_
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
/*
|
|
177
|
+
// Convert GetTimeOfDay output to 64-bit usec
|
|
178
|
+
static inline uint64 Microseconds(const struct timeval& t) {
|
|
179
|
+
// The SumReducer uses uint64, so convert to (uint64) microseconds,
|
|
180
|
+
// not (double) seconds.
|
|
181
|
+
return t.tv_sec * 1000000ULL + t.tv_usec;
|
|
182
|
+
}
|
|
183
|
+
*/
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
// Returns true if character is < > or &
|
|
187
|
+
bool inline IsSpecial(char c) {
|
|
188
|
+
if ((c & 0xe0) == 0x20) {
|
|
189
|
+
return kSpecialSymbol[static_cast<uint8>(c)];
|
|
190
|
+
}
|
|
191
|
+
return false;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
// Quick Skip to next letter or < > & or to end of string (eos)
|
|
195
|
+
// Always return is_letter for eos
|
|
196
|
+
int ScanToLetterOrSpecial(const char* src, int len) {
|
|
197
|
+
int bytes_consumed;
|
|
198
|
+
cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len,
|
|
199
|
+
&bytes_consumed);
|
|
200
|
+
return bytes_consumed;
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
// src points to non-letter, such as tag-opening '<'
|
|
206
|
+
// Return length from here to next possible letter
|
|
207
|
+
// On eos or another < before >, return 1
|
|
208
|
+
// advances <tag>
|
|
209
|
+
// | |
|
|
210
|
+
// advances <tag> ... </tag> for <script> <style>
|
|
211
|
+
// | |
|
|
212
|
+
// advances <!-- ... <tag> ... -->
|
|
213
|
+
// | |
|
|
214
|
+
// advances <tag
|
|
215
|
+
// || (1)
|
|
216
|
+
// advances <tag <tag2>
|
|
217
|
+
// || (1)
|
|
218
|
+
int ScanToPossibleLetter(const char* isrc, int len) {
|
|
219
|
+
const uint8* src = reinterpret_cast<const uint8*>(isrc);
|
|
220
|
+
const uint8* srclimit = src + len;
|
|
221
|
+
const uint8* tagParseTbl = kTagParseTbl_0;
|
|
222
|
+
int e = 0;
|
|
223
|
+
while (src < srclimit) {
|
|
224
|
+
e = tagParseTbl[kCharToSub[*src++]];
|
|
225
|
+
if ((e & ~1) == 0) {
|
|
226
|
+
// We overshot by one byte
|
|
227
|
+
--src;
|
|
228
|
+
break;
|
|
229
|
+
}
|
|
230
|
+
tagParseTbl = &kTagParseTbl_0[e * 20];
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
if (src >= srclimit) {
|
|
234
|
+
// We fell off the end of the text.
|
|
235
|
+
// It looks like the most common case for this is a truncated file, not
|
|
236
|
+
// mismatched angle brackets. So we pretend that the last char was '>'
|
|
237
|
+
return len;
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
// OK to be in state 0 or state 2 at exit
|
|
241
|
+
if ((e != 0) && (e != 2)) {
|
|
242
|
+
// Error, '<' followed by '<'
|
|
243
|
+
// We want to back up to first <, then advance by one byte past it
|
|
244
|
+
int offset = src - reinterpret_cast<const uint8*>(isrc);
|
|
245
|
+
// printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
|
|
246
|
+
|
|
247
|
+
// Backscan to first '<' and return enough length to just get past it
|
|
248
|
+
--offset; // back up over the second '<', which caused us to stop
|
|
249
|
+
while ((0 < offset) && (isrc[offset] != '<')) {
|
|
250
|
+
// Find the first '<', which is unmatched
|
|
251
|
+
--offset;
|
|
252
|
+
}
|
|
253
|
+
// skip to just beyond first '<'
|
|
254
|
+
// printf(" returning %d\n", offset + 1);
|
|
255
|
+
return offset + 1;
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
return src - reinterpret_cast<const uint8*>(isrc);
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
// Prepares a scanner over buffer[0..buffer_length).
// The read cursor starts at the front of the buffer, and two scratch arrays
// are heap-allocated here; the destructor releases them.
ScriptScanner::ScriptScanner(const char* buffer, int buffer_length,
                             bool is_plain_text)
    : start_byte_(buffer),
      next_byte_(buffer),
      next_byte_limit_(buffer + buffer_length),
      byte_length_(buffer_length),
      is_plain_text_(is_plain_text) {
  // Scratch storage for the extracted span text.
  script_buffer_ = new char[getone::kMaxScriptBuffer];
  // Second scratch array, sized by kMaxScriptLowerBuffer.
  script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
}
|
|
274
|
+
|
|
275
|
+
// Frees the two scratch arrays allocated by the constructor, in reverse order
// of allocation.
ScriptScanner::~ScriptScanner() {
  delete[] script_buffer_lower_;
  delete[] script_buffer_;
}
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
// Get to the first real non-tag letter or entity that is a letter
|
|
284
|
+
// Sets script of that letter
|
|
285
|
+
// Return len if no more letters
|
|
286
|
+
int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
|
|
287
|
+
int sc = UNKNOWN_LSCRIPT;
|
|
288
|
+
int skip = 0;
|
|
289
|
+
int tlen, plen;
|
|
290
|
+
|
|
291
|
+
// Do run of non-letters (tag | &NL | NL)*
|
|
292
|
+
while (skip < len) {
|
|
293
|
+
// Do fast scan to next interesting byte
|
|
294
|
+
// int oldskip = skip;
|
|
295
|
+
skip += ScanToLetterOrSpecial(src + skip, len - skip);
|
|
296
|
+
// TEMP
|
|
297
|
+
// printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
|
|
298
|
+
// oldskip, src[oldskip], skip, src[skip]);
|
|
299
|
+
|
|
300
|
+
// Check for no more letters/specials
|
|
301
|
+
if (skip >= len) {
|
|
302
|
+
// All done
|
|
303
|
+
return len;
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
// We are at a letter, nonletter, tag, or entity
|
|
307
|
+
if (IsSpecial(src[skip]) && !is_plain_text_) {
|
|
308
|
+
if (src[skip] == '<') {
|
|
309
|
+
// Begining of tag; skip to end and go around again
|
|
310
|
+
tlen = ScanToPossibleLetter(src + skip, len - skip);
|
|
311
|
+
sc = 0;
|
|
312
|
+
// printf("<...> ");
|
|
313
|
+
} else if (src[skip] == '>') {
|
|
314
|
+
// Unexpected end of tag; skip it and go around again
|
|
315
|
+
tlen = 1; // Over the >
|
|
316
|
+
sc = 0;
|
|
317
|
+
// printf("..> ");
|
|
318
|
+
} else if (src[skip] == '&') {
|
|
319
|
+
// Expand entity, no advance
|
|
320
|
+
char temp[4];
|
|
321
|
+
EntityToBuffer(src + skip, len - skip,
|
|
322
|
+
temp, &tlen, &plen);
|
|
323
|
+
sc = getone::GetUTF8LetterScriptNum(temp);
|
|
324
|
+
// printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
|
|
325
|
+
}
|
|
326
|
+
} else {
|
|
327
|
+
// Update 1..4 bytes
|
|
328
|
+
tlen = cld_UniLib::OneCharLen(src + skip);
|
|
329
|
+
sc = getone::GetUTF8LetterScriptNum(src + skip);
|
|
330
|
+
// printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
|
|
331
|
+
}
|
|
332
|
+
// TEMP
|
|
333
|
+
// printf("sc=%d ", sc);
|
|
334
|
+
if (sc != 0) {break;} // Letter found
|
|
335
|
+
skip += tlen; // Advance
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
*script = sc;
|
|
339
|
+
return skip;
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
|
345
|
+
// Buffer has leading space and all text is lowercased
|
|
346
|
+
bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
|
|
347
|
+
span->text = script_buffer_;
|
|
348
|
+
span->text_bytes = 0;
|
|
349
|
+
span->offset = next_byte_ - start_byte_;
|
|
350
|
+
span->script = UNKNOWN_LSCRIPT;
|
|
351
|
+
span->lang = UNKNOWN_LANGUAGE;
|
|
352
|
+
span->truncated = false;
|
|
353
|
+
|
|
354
|
+
// printf("GetOneScriptSpan[[ ");
|
|
355
|
+
// struct timeval script_start, script_mid, script_end;
|
|
356
|
+
|
|
357
|
+
int spanscript; // The script of this span
|
|
358
|
+
int sc = UNKNOWN_LSCRIPT; // The script of next character
|
|
359
|
+
int tlen, plen;
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
script_buffer_[0] = ' '; // Always a space at front of output
|
|
363
|
+
script_buffer_[1] = '\0';
|
|
364
|
+
int take = 0;
|
|
365
|
+
int put = 1; // Start after the initial space
|
|
366
|
+
|
|
367
|
+
// gettimeofday(&script_start, NULL);
|
|
368
|
+
// Get to the first real non-tag letter or entity that is a letter
|
|
369
|
+
int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
|
|
370
|
+
next_byte_ += skip;
|
|
371
|
+
byte_length_ -= skip;
|
|
372
|
+
if (byte_length_ <= 0) {
|
|
373
|
+
// printf("]]\n");
|
|
374
|
+
return false; // No more letters to be found
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
// gettimeofday(&script_mid, NULL);
|
|
378
|
+
|
|
379
|
+
// There is at least one letter, so we know the script for this span
|
|
380
|
+
// printf("{%d} ", spanscript);
|
|
381
|
+
span->script = (UnicodeLScript)spanscript;
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
// Go over alternating spans of same-script letters and non-letters,
|
|
385
|
+
// copying letters to buffer with single spaces for each run of non-letters
|
|
386
|
+
while (take < byte_length_) {
|
|
387
|
+
// Copy run of letters in same script (&LS | LS)*
|
|
388
|
+
int letter_count = 0; // Keep track of word length
|
|
389
|
+
bool need_break = false;
|
|
390
|
+
while (take < byte_length_) {
|
|
391
|
+
// We are at a letter, nonletter, tag, or entity
|
|
392
|
+
if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
|
|
393
|
+
// printf("\"%c\" ", next_byte_[take]);
|
|
394
|
+
if (next_byte_[take] == '<') {
|
|
395
|
+
// Beginning of tag
|
|
396
|
+
sc = 0;
|
|
397
|
+
break;
|
|
398
|
+
} else if (next_byte_[take] == '>') {
|
|
399
|
+
// Unexpected end of tag
|
|
400
|
+
sc = 0;
|
|
401
|
+
break;
|
|
402
|
+
} else if (next_byte_[take] == '&') {
|
|
403
|
+
// Copy entity, no advance
|
|
404
|
+
EntityToBuffer(next_byte_ + take, byte_length_ - take,
|
|
405
|
+
script_buffer_ + put, &tlen, &plen);
|
|
406
|
+
sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
|
|
407
|
+
}
|
|
408
|
+
} else {
|
|
409
|
+
// Real letter, safely copy up to 4 bytes, increment by 1..4
|
|
410
|
+
// Will update by 1..4 bytes at Advance, below
|
|
411
|
+
tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
|
|
412
|
+
if (take < (byte_length_ - 3)) {
|
|
413
|
+
// Fast case
|
|
414
|
+
*reinterpret_cast<uint32*>(script_buffer_ + put) =
|
|
415
|
+
*reinterpret_cast<const uint32*>(next_byte_ + take);
|
|
416
|
+
} else {
|
|
417
|
+
// Slow case, happens 1-3 times per input document
|
|
418
|
+
memcpy(script_buffer_ + put, next_byte_ + take, plen);
|
|
419
|
+
}
|
|
420
|
+
sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
|
|
421
|
+
}
|
|
422
|
+
// printf("sc(%c)=%d ", next_byte_[take], sc);
|
|
423
|
+
// char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);
|
|
424
|
+
// xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);
|
|
425
|
+
|
|
426
|
+
// Allow continue across a single letter in a different script:
|
|
427
|
+
// A B D = three scripts, c = common script, i = inherited script,
|
|
428
|
+
// - = don't care, ( = take position before the += below
|
|
429
|
+
// AAA(A- continue
|
|
430
|
+
//
|
|
431
|
+
// AAA(BA continue
|
|
432
|
+
// AAA(BB break
|
|
433
|
+
// AAA(Bc continue (breaks after B)
|
|
434
|
+
// AAA(BD break
|
|
435
|
+
// AAA(Bi break
|
|
436
|
+
//
|
|
437
|
+
// AAA(c- break
|
|
438
|
+
//
|
|
439
|
+
// AAA(i- continue
|
|
440
|
+
//
|
|
441
|
+
|
|
442
|
+
if ((sc != spanscript) && (sc != ULScript_Inherited)) {
|
|
443
|
+
// Might need to break this script span
|
|
444
|
+
if (sc == ULScript_Common) {
|
|
445
|
+
need_break = true;
|
|
446
|
+
} else {
|
|
447
|
+
// Look at next following character, ignoring entity as Common
|
|
448
|
+
int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
|
|
449
|
+
if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
|
|
450
|
+
need_break = true;
|
|
451
|
+
}
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
if (need_break) {break;} // Non-letter or letter in wrong script
|
|
455
|
+
|
|
456
|
+
take += tlen; // Advance
|
|
457
|
+
put += plen; // Advance
|
|
458
|
+
++letter_count;
|
|
459
|
+
if (put >= getone::kMaxScriptBytes) {
|
|
460
|
+
// Buffer is full
|
|
461
|
+
span->truncated = true;
|
|
462
|
+
break;
|
|
463
|
+
}
|
|
464
|
+
} // End while letters
|
|
465
|
+
|
|
466
|
+
// Do run of non-letters (tag | &NL | NL)*
|
|
467
|
+
while (take < byte_length_) {
|
|
468
|
+
// Do fast scan to next interesting byte
|
|
469
|
+
take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
|
|
470
|
+
|
|
471
|
+
// Check for no more letters/specials
|
|
472
|
+
if (take >= byte_length_) {
|
|
473
|
+
take = byte_length_;
|
|
474
|
+
break;
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
// We are at a letter, nonletter, tag, or entity
|
|
478
|
+
if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
|
|
479
|
+
// printf("\"%c\" ", next_byte_[take]);
|
|
480
|
+
if (next_byte_[take] == '<') {
|
|
481
|
+
// Beginning of tag; skip to end and go around again
|
|
482
|
+
tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
|
|
483
|
+
sc = 0;
|
|
484
|
+
// printf("<...> ");
|
|
485
|
+
} else if (next_byte_[take] == '>') {
|
|
486
|
+
// Unexpected end of tag; skip it and go around again
|
|
487
|
+
tlen = 1; // Over the >
|
|
488
|
+
sc = 0;
|
|
489
|
+
// printf("..> ");
|
|
490
|
+
} else if (next_byte_[take] == '&') {
|
|
491
|
+
// Expand entity, no advance
|
|
492
|
+
EntityToBuffer(next_byte_ + take, byte_length_ - take,
|
|
493
|
+
script_buffer_ + put, &tlen, &plen);
|
|
494
|
+
sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
|
|
495
|
+
}
|
|
496
|
+
} else {
|
|
497
|
+
// Update 1..4
|
|
498
|
+
tlen = cld_UniLib::OneCharLen(next_byte_ + take);
|
|
499
|
+
sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
|
|
500
|
+
}
|
|
501
|
+
// printf("sc[%c]=%d ", next_byte_[take], sc);
|
|
502
|
+
if (sc != 0) {break;} // Letter found
|
|
503
|
+
take += tlen; // Advance
|
|
504
|
+
} // End while not-letters
|
|
505
|
+
|
|
506
|
+
script_buffer_[put++] = ' ';
|
|
507
|
+
|
|
508
|
+
// We are at a letter again (or eos), after letter* not-letter*
|
|
509
|
+
if (sc != spanscript) {break;} // Letter in wrong script
|
|
510
|
+
if (put >= getone::kMaxScriptBytes - 8) {
|
|
511
|
+
// Buffer is almost full
|
|
512
|
+
span->truncated = true;
|
|
513
|
+
break;
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
// Update input position
|
|
518
|
+
next_byte_ += take;
|
|
519
|
+
byte_length_ -= take;
|
|
520
|
+
|
|
521
|
+
// Put four more spaces/NUL. Worst case is abcd _ _ _ \0
|
|
522
|
+
// kMaxScriptBytes | | put
|
|
523
|
+
script_buffer_[put + 0] = ' ';
|
|
524
|
+
script_buffer_[put + 1] = ' ';
|
|
525
|
+
script_buffer_[put + 2] = ' ';
|
|
526
|
+
script_buffer_[put + 3] = '\0';
|
|
527
|
+
|
|
528
|
+
span->text_bytes = put; // Does not include the last four chars above
|
|
529
|
+
|
|
530
|
+
// printf(" %d]]\n\n", put);
|
|
531
|
+
return true;
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
// Force Latin, Cyrillic, Greek scripts to be lowercase
|
|
535
|
+
void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
|
|
536
|
+
// On Windows, text is lowercased beforehand, so no need to do anything here.
|
|
537
|
+
#if !defined(CLD_WINDOWS)
|
|
538
|
+
// If needed, lowercase all the text. If we do it sooner, might miss
|
|
539
|
+
// lowercasing an entity such as Á
|
|
540
|
+
// We only need to do this for Latn and Cyrl scripts
|
|
541
|
+
if ((span->script == ULScript_Latin) ||
|
|
542
|
+
(span->script == ULScript_Cyrillic) ||
|
|
543
|
+
(span->script == ULScript_Greek)) {
|
|
544
|
+
// Full Unicode lowercase of the entire buffer, including
|
|
545
|
+
// four pad bytes off the end
|
|
546
|
+
int consumed, filled;
|
|
547
|
+
UniLib::ToLower(span->text, span->text_bytes + 4,
|
|
548
|
+
script_buffer_lower_, getone::kMaxScriptLowerBuffer,
|
|
549
|
+
&consumed, &filled);
|
|
550
|
+
span->text = script_buffer_lower_;
|
|
551
|
+
span->text_bytes = filled - 4;
|
|
552
|
+
}
|
|
553
|
+
#endif
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
|
557
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
|
558
|
+
bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
|
|
559
|
+
bool ok = GetOneScriptSpan(span);
|
|
560
|
+
LowerScriptSpan(span);
|
|
561
|
+
return ok;
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
// Gets lscript number for letters; always returns
|
|
565
|
+
// 0 (common script) for non-letters
|
|
566
|
+
int getone::GetUTF8LetterScriptNum(const char* src) {
|
|
567
|
+
int srclen = cld_UniLib::OneCharLen(src);
|
|
568
|
+
const uint8* usrc = reinterpret_cast<const uint8*>(src);
|
|
569
|
+
return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
|
|
570
|
+
}
|