cld-fixed 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/.rspec +2 -0
- data/Gemfile +6 -0
- data/LICENSE +27 -0
- data/README.md +34 -0
- data/Rakefile +5 -0
- data/cld.gemspec +22 -0
- data/ext/cld/Makefile.am +28 -0
- data/ext/cld/Makefile.in +790 -0
- data/ext/cld/aclocal.m4 +8895 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +115 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/build_aux/config.guess +1500 -0
- data/ext/cld/build_aux/config.sub +1616 -0
- data/ext/cld/build_aux/depcomp +584 -0
- data/ext/cld/build_aux/install-sh +507 -0
- data/ext/cld/build_aux/ltmain.sh +8745 -0
- data/ext/cld/build_aux/missing +367 -0
- data/ext/cld/cld_encodings.h +95 -0
- data/ext/cld/configure +17362 -0
- data/ext/cld/configure.ac +14 -0
- data/ext/cld/encodings/compact_lang_det/#cldutil.cc# +905 -0
- data/ext/cld/encodings/compact_lang_det/#cldutil.h# +1205 -0
- data/ext/cld/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
- data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
- data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
- data/ext/cld/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
- data/ext/cld/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
- data/ext/cld/encodings/compact_lang_det/#tote.cc# +299 -0
- data/ext/cld/encodings/compact_lang_det/#tote.h# +89 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +7 -0
- data/ext/cld/languages/internal/#languages.cc# +337 -0
- data/ext/cld/languages/internal/languages.cc +336 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/ext/cld/thunk.cc +55 -0
- data/lib/cld.rb +21 -0
- data/lib/cld/version.rb +3 -0
- data/spec/cld_spec.rb +67 -0
- data/spec/spec_helper.rb +6 -0
- metadata +193 -0
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|
|
7
|
+
|
|
8
|
+
#include "encodings/compact_lang_det/letterscript_enum.h"
|
|
9
|
+
#include "encodings/compact_lang_det/compact_lang_det_impl.h"
|
|
10
|
+
|
|
11
|
+
namespace getone {
|
|
12
|
+
// Base buffer size, in bytes, from which the other script-span limits below
// are derived.
static const int kMaxScriptBuffer = 4096;

// One and a half times kMaxScriptBuffer.
static const int kMaxScriptLowerBuffer = (3 * kMaxScriptBuffer) / 2;

// kMaxScriptBuffer minus 8 bytes, to leave some room.
static const int kMaxScriptBytes = kMaxScriptBuffer - 8;

// Size, in bytes, of each LangScanner answer buffer.
static const int kMaxAnswerBuffer = 256;
|
|
16
|
+
|
|
17
|
+
typedef enum UnicodeLScript ULScript;
|
|
18
|
+
|
|
19
|
+
typedef struct {
|
|
20
|
+
char* text; // Pointer to the span, somewhere
|
|
21
|
+
int text_bytes; // Number of bytes of text in the span
|
|
22
|
+
int offset; // Offset of start of span in original input buffer
|
|
23
|
+
ULScript script; // Script of all the letters in this span
|
|
24
|
+
Language lang; // Language identified for this span
|
|
25
|
+
bool truncated; // true if buffer filled up before a
|
|
26
|
+
// different script or EOF was found
|
|
27
|
+
} LangSpan;
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
// True when c has the bit pattern 10xxxxxx, i.e. its byte value lies in
// 0x80..0xBF (a UTF-8 continuation byte).
static inline bool IsContinuationByte(char c) {
  const unsigned char top_two_bits = static_cast<unsigned char>(c) & 0xC0;
  return top_two_bits == 0x80;
}
|
|
33
|
+
|
|
34
|
+
// Gets the lscript number for the letter at src.
// Non-letters always yield 0 (common script).
int GetUTF8LetterScriptNum(const char* src);

// Moves from src to the next quadgram, a step of +2..+5 bytes.
// Looks at src[0..4].
const char* AdvanceQuad(const char* src);
|
|
42
|
+
} // end namespace getone
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class ScriptScanner {
|
|
50
|
+
public:
|
|
51
|
+
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
|
|
52
|
+
~ScriptScanner();
|
|
53
|
+
|
|
54
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
|
55
|
+
bool GetOneScriptSpan(getone::LangSpan* span);
|
|
56
|
+
|
|
57
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
|
58
|
+
void LowerScriptSpan(getone::LangSpan* span);
|
|
59
|
+
|
|
60
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
|
61
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
|
62
|
+
bool GetOneScriptSpanLower(getone::LangSpan* span);
|
|
63
|
+
|
|
64
|
+
private:
|
|
65
|
+
int SkipToFrontOfSpan(const char* src, int len, int* script);
|
|
66
|
+
|
|
67
|
+
const char* start_byte_;
|
|
68
|
+
const char* next_byte_;
|
|
69
|
+
const char* next_byte_limit_;
|
|
70
|
+
int byte_length_;
|
|
71
|
+
bool is_plain_text_;
|
|
72
|
+
char* script_buffer_; // Holds text with expanded entities
|
|
73
|
+
char* script_buffer_lower_; // Holds lowercased text
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class LangScanner {
|
|
78
|
+
public:
|
|
79
|
+
LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
|
|
80
|
+
getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
|
|
81
|
+
int maxlangs, int minlangspan);
|
|
82
|
+
~LangScanner();
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
int script() {return script_;}
|
|
86
|
+
|
|
87
|
+
// Use new text
|
|
88
|
+
// Keep smoothing state if same script, otherwise reinit smoothing
|
|
89
|
+
void NewText(getone::LangSpan* spn);
|
|
90
|
+
|
|
91
|
+
bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
|
|
92
|
+
bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
|
|
93
|
+
|
|
94
|
+
// The real ones
|
|
95
|
+
bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
|
|
96
|
+
getone::LangSpan* span);
|
|
97
|
+
bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
|
|
98
|
+
getone::LangSpan* span);
|
|
99
|
+
|
|
100
|
+
// Increases language bias by delta
|
|
101
|
+
void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
|
|
102
|
+
Language key, int delta);
|
|
103
|
+
|
|
104
|
+
// For debugging output
|
|
105
|
+
int next_answer_;
|
|
106
|
+
char answer_buffer_[getone::kMaxAnswerBuffer];
|
|
107
|
+
char answer_buffer2_[getone::kMaxAnswerBuffer];
|
|
108
|
+
char answer_buffer3_[getone::kMaxAnswerBuffer];
|
|
109
|
+
char answer_buffer4_[getone::kMaxAnswerBuffer];
|
|
110
|
+
|
|
111
|
+
private:
|
|
112
|
+
const char* start_byte_;
|
|
113
|
+
const char* next_byte_limit_;
|
|
114
|
+
const char* next_byte_;
|
|
115
|
+
const char* onelangspan_begin_;
|
|
116
|
+
int byte_length_;
|
|
117
|
+
int script_;
|
|
118
|
+
Language spanlang_;
|
|
119
|
+
int smoothwidth_;
|
|
120
|
+
int smoothwidth_2_;
|
|
121
|
+
int smoothcandidates_;
|
|
122
|
+
int maxlangs_;
|
|
123
|
+
int minlangspan_;
|
|
124
|
+
int rb_size_;
|
|
125
|
+
int next_rb_;
|
|
126
|
+
int rb_mask_;
|
|
127
|
+
uint32* rb_;
|
|
128
|
+
int* offset_rb_;
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/letterscript_enum.h"
|
|
6
|
+
|
|
7
|
+
#include "encodings/compact_lang_det/win/cld_logging.h"
|
|
8
|
+
|
|
9
|
+
// Long script names, indexed by UnicodeLScript value; UnicodeLScriptName()
// returns entries of this table. The array itself is const (like
// kLScriptName4) so that no entry can be reassigned at run time.
static const char* const kUnicodeLScriptNames[ULScript_NUM_SCRIPTS] = {
  "Common",
  "Latin",
  "Greek",
  "Cyrillic",
  "Armenian",
  "Hebrew",
  "Arabic",
  "Syriac",
  "Thaana",
  "Devanagari",
  "Bengali",
  "Gurmukhi",
  "Gujarati",
  "Oriya",
  "Tamil",
  "Telugu",
  "Kannada",
  "Malayalam",
  "Sinhala",
  "Thai",
  "Lao",
  "Tibetan",
  "Myanmar",
  "Georgian",
  "HanCJK",
  "Ethiopic",
  "Cherokee",
  "Canadian_Aboriginal",
  "Ogham",
  "Runic",
  "Khmer",
  "Mongolian",
  "Yi",
  "Old_Italic",
  "Gothic",
  "Deseret",
  "Inherited",
  "Tagalog",
  "Hanunoo",
  "Buhid",
  "Tagbanwa",
  "Limbu",
  "Tai_Le",
  "Linear_B",
  "Ugaritic",
  "Shavian",
  "Osmanya",
  "Cypriot",
  "Buginese",
  "Coptic",
  "New_Tai_Lue",
  "Glagolitic",
  "Tifinagh",
  "Syloti_Nagri",
  "Old_Persian",
  "Kharoshthi",
  "Balinese",
  "Cuneiform",
  "Phoenician",
  "Phags_Pa",
  "Nko",

  // Unicode 5.1 beta
  "Sundanese",
  "Lepcha",
  "Ol_Chiki",
  "Vai",
  "Saurashtra",
  "Kayah_Li",
  "Rejang",
  "Lycian",
  "Carian",
  "Lydian",
  "Cham",
};
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
// Unicode 5.1 beta script names from
|
|
88
|
+
// http://www.unicode.org/Public/5.1.0/diffs/5.0.0-5.1.0.all.2.diffs
|
|
89
|
+
// NOTE: 'Vai ' => "Vaii" to make four letters, not three
|
|
90
|
+
// see http://unicode.org/iso15924/iso15924-codes.html
|
|
91
|
+
const char* const kLScriptName4[ULScript_NUM_SCRIPTS] = {
|
|
92
|
+
"Zyyy", "Latn", "Grek", "Cyrl", "Armn", "Hebr", "Arab", "Syrc",
|
|
93
|
+
"Thaa", "Deva", "Beng", "Guru", "Gujr", "Orya", "Taml", "Telu",
|
|
94
|
+
"Knda", "Mlym", "Sinh", "Thai", "Laoo", "Tibt", "Mymr", "Geor",
|
|
95
|
+
"Hani", "Ethi", "Cher", "Cans", "Ogam", "Runr", "Khmr", "Mong",
|
|
96
|
+
|
|
97
|
+
"Yiii", "Ital", "Goth", "Dsrt", "Zzzz", "Tglg", "Hano", "Buhd",
|
|
98
|
+
"Tagb", "Limb", "Tale", "Linb", "Ugar", "Shaw", "Osma", "Cprt",
|
|
99
|
+
"Bugi", "Copt", "Talu", "Glag", "Tfng", "Sylo", "Xpeo", "Khar",
|
|
100
|
+
"Bali", "Xsux", "Phnx", "Phag", "Nkoo",
|
|
101
|
+
|
|
102
|
+
// Unicode 5.1 beta
|
|
103
|
+
"Sund", "Lepc", "Olck", "Vaii", "Saur", "Kali", "Rjng", "Lyci",
|
|
104
|
+
"Cari", "Lydi", "Cham",
|
|
105
|
+
};
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
const char* UnicodeLScriptName(const UnicodeLScript ls) {
|
|
109
|
+
CHECK(ls >= 0 && ls < ULScript_NUM_SCRIPTS);
|
|
110
|
+
return kUnicodeLScriptNames[ls];
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
const char* UnicodeLScriptCode(const UnicodeLScript ls) {
|
|
115
|
+
CHECK(ls >= 0 && ls < ULScript_NUM_SCRIPTS);
|
|
116
|
+
return kLScriptName4[ls];
|
|
117
|
+
}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
|
|
7
|
+
|
|
8
|
+
// Letter scripts. Values are consecutive starting at 0, so they can index
// tables of ULScript_NUM_SCRIPTS entries.
enum UnicodeLScript {
  // 0..7
  ULScript_Common = 0,
  ULScript_Latin,
  ULScript_Greek,
  ULScript_Cyrillic,
  ULScript_Armenian,
  ULScript_Hebrew,
  ULScript_Arabic,
  ULScript_Syriac,
  // 8..15
  ULScript_Thaana,
  ULScript_Devanagari,
  ULScript_Bengali,
  ULScript_Gurmukhi,
  ULScript_Gujarati,
  ULScript_Oriya,
  ULScript_Tamil,
  ULScript_Telugu,
  // 16..23
  ULScript_Kannada,
  ULScript_Malayalam,
  ULScript_Sinhala,
  ULScript_Thai,
  ULScript_Lao,
  ULScript_Tibetan,
  ULScript_Myanmar,
  ULScript_Georgian,
  // 24..31
  ULScript_HanCJK,
  ULScript_Ethiopic,
  ULScript_Cherokee,
  ULScript_Canadian_Aboriginal,
  ULScript_Ogham,
  ULScript_Runic,
  ULScript_Khmer,
  ULScript_Mongolian,
  // 32..39
  ULScript_Yi,
  ULScript_Old_Italic,
  ULScript_Gothic,
  ULScript_Deseret,
  ULScript_Inherited,
  ULScript_Tagalog,
  ULScript_Hanunoo,
  ULScript_Buhid,
  // 40..47
  ULScript_Tagbanwa,
  ULScript_Limbu,
  ULScript_Tai_Le,
  ULScript_Linear_B,
  ULScript_Ugaritic,
  ULScript_Shavian,
  ULScript_Osmanya,
  ULScript_Cypriot,
  // 48..55
  ULScript_Buginese,
  ULScript_Coptic,
  ULScript_New_Tai_Lue,
  ULScript_Glagolitic,
  ULScript_Tifinagh,
  ULScript_Syloti_Nagri,
  ULScript_Old_Persian,
  ULScript_Kharoshthi,
  // 56..60
  ULScript_Balinese,
  ULScript_Cuneiform,
  ULScript_Phoenician,
  ULScript_Phags_Pa,
  ULScript_Nko,

  // Unicode 5.1, 61..71
  ULScript_Sundanese,
  ULScript_Lepcha,
  ULScript_Ol_Chiki,
  ULScript_Vai,
  ULScript_Saurashtra,
  ULScript_Kayah_Li,
  ULScript_Rejang,
  ULScript_Lycian,
  ULScript_Carian,
  ULScript_Lydian,
  ULScript_Cham,

  // Number of scripts above; not itself a script.
  ULScript_NUM_SCRIPTS
};
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
// UNKNOWN_LSCRIPT is the same value as ULScript_Common.
static const UnicodeLScript UNKNOWN_LSCRIPT = ULScript_Common;

// Return the name corresponding to the script ls, e.g. "Latin".
// Passing an ls that is not a valid UnicodeLScript is a fatal error.
const char* UnicodeLScriptName(const UnicodeLScript ls);

// Return the 4-letter code corresponding to the script ls, e.g. "Latn".
// Passing an ls that is not a valid UnicodeLScript is a fatal error.
const char* UnicodeLScriptCode(const UnicodeLScript ls);
|
|
98
|
+
|
|
99
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// Remember a subset of a sequence of values, using a modest amount of memory
|
|
6
|
+
|
|
7
|
+
/***
|
|
8
|
+
Design:
|
|
9
|
+
Accumulate in powers of three, using 3-way median to collapse entries.
|
|
10
|
+
At any given time, there is one most-dense (highest power of 3) range of
|
|
11
|
+
entries and a series of less-dense ranges that hold 0..2 entries each. There
|
|
12
|
+
is a bounded-size storage array of S cells for all the entries.
|
|
13
|
+
|
|
14
|
+
The overflow detect is set up so that a new higher power of 3, K+1, is
|
|
15
|
+
triggered precisely when range K has 3n entries and all ranges < K have
|
|
16
|
+
zero entries.
|
|
17
|
+
|
|
18
|
+
In general, think of the range sizes as a multi-digit base 3 number, except
|
|
19
|
+
the highest digit may exceed 2:
|
|
20
|
+
|
|
21
|
+
3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
|
|
22
|
+
0 0 0 0 3n-1 2 2 unused:1
|
|
23
|
+
|
|
24
|
+
There are a total of 3n-1 + 2 + 2 entries in use. Assume a size limit S at
|
|
25
|
+
one more than that, and we add a new 3**0 entry and "carry" by performing
|
|
26
|
+
medians on any group of 3 elements:
|
|
27
|
+
|
|
28
|
+
3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
|
|
29
|
+
0 0 0 0 3n-1 2 3 unused:0
|
|
30
|
+
0 0 0 0 3n-1 3 0 carry unused:2
|
|
31
|
+
0 0 0 0 3n 0 0 carry unused:4
|
|
32
|
+
|
|
33
|
+
To accumulate 2 entries at all levels < K and 3 just before the first carry at
|
|
34
|
+
level 0, we need 2*K + 1 unused cells after doing all carries, or five cells
|
|
35
|
+
in this case. Since we only have 4 cells in the example above, we need to
|
|
36
|
+
make room by starting a new power of three:
|
|
37
|
+
|
|
38
|
+
3**6 3**5 3**4 3**3 3**2 3**1 3**0
|
|
39
|
+
0 0 0 0 3n 0 0 K=2 unused:4
|
|
40
|
+
0 0 0 n 0 0 0 K=3 unused:2n+4
|
|
41
|
+
|
|
42
|
+
In the code below, we don't worry about overflow from the topmost place.
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
***/
|
|
46
|
+
|
|
47
|
+
#include "encodings/compact_lang_det/subsetsequence.h"
|
|
48
|
+
#include <stdio.h>
|
|
49
|
+
|
|
50
|
+
#include "encodings/compact_lang_det/win/cld_logging.h"
|
|
51
|
+
|
|
52
|
+
// Debug helper: prints label, then the n ints v[0..n-1] separated by spaces,
// then a newline, all on stdout.
void DumpInts(const char* label, const int* v, int n) {
  printf("%s ", label);
  int index = 0;
  while (index < n) {
    printf("%d ", v[index]);
    ++index;
  }
  printf("\n");
}
|
|
59
|
+
|
|
60
|
+
// Debug helper: prints label, then the n uint8 values v[0..n-1] as decimal
// numbers separated by spaces, then a newline, all on stdout.
void DumpUint8s(const char* label, const uint8* v, int n) {
  printf("%s ", label);
  int index = 0;
  while (index < n) {
    printf("%d ", v[index]);
    ++index;
  }
  printf("\n");
}
|
|
67
|
+
|
|
68
|
+
// Return median of seq_[sub] .. seq_[sub+2], favoring middle element
|
|
69
|
+
uint8 SubsetSequence::Median3(int sub) {
|
|
70
|
+
if (seq_[sub] == seq_[sub + 1]) {
|
|
71
|
+
return seq_[sub];
|
|
72
|
+
}
|
|
73
|
+
if (seq_[sub] == seq_[sub + 2]) {
|
|
74
|
+
return seq_[sub];
|
|
75
|
+
}
|
|
76
|
+
return seq_[sub + 1];
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
// Resets the sequence to empty, at level 0.
void SubsetSequence::Init() {
  next_e_ = 0;
  seq_[0] = 0;      // Default value if no calls to Add
  k_ = 0;
  count_[0] = 0;

  // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
  int reserve = (2 * k_ + 1);
  level_limit_e_ = kMaxSeq_ - reserve;
  level_limit_e_ -= level_limit_e_ % 3;   // Round down to multiple of 3

  // The first carry only happens once the level limit is reached.
  limit_e_ = level_limit_e_;
}
|
|
93
|
+
|
|
94
|
+
// Compress level k by 3x, creating level k+1
|
|
95
|
+
void SubsetSequence::NewLevel() {
|
|
96
|
+
// printf("NewLevel 3 ** %d\n", k_ + 1);
|
|
97
|
+
//DumpUint8s("count[k]", count_, k_ + 1);
|
|
98
|
+
//DumpUint8s("seq[next]", seq_, next_e_);
|
|
99
|
+
|
|
100
|
+
// Incoming level must be an exact multiple of three in size
|
|
101
|
+
CHECK((count_[k_] % 3) == 0);
|
|
102
|
+
int k_size = count_[k_];
|
|
103
|
+
int new_size = k_size / 3;
|
|
104
|
+
|
|
105
|
+
// Compress down by 3x, via median
|
|
106
|
+
for (int j = 0; j < new_size; ++j) {
|
|
107
|
+
seq_[j] = Median3(j * 3);
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
// Update counts
|
|
111
|
+
count_[k_] = 0;
|
|
112
|
+
// Else Overflow -- just continue with 3x dense Level K
|
|
113
|
+
if (k_ < (kMaxLevel_ - 1)) {++k_;}
|
|
114
|
+
count_[k_] = new_size;
|
|
115
|
+
|
|
116
|
+
// Update limits
|
|
117
|
+
next_e_ = new_size;
|
|
118
|
+
limit_e_ = next_e_ + 3;
|
|
119
|
+
|
|
120
|
+
// Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
|
|
121
|
+
int reserve = (2 * k_ + 1);
|
|
122
|
+
level_limit_e_ = kMaxSeq_ - reserve;
|
|
123
|
+
level_limit_e_ = (level_limit_e_ / 3) * 3; // Round down to multiple of 3
|
|
124
|
+
//
|
|
125
|
+
//DumpUint8s("after: count[k]", count_, k_ + 1);
|
|
126
|
+
//DumpUint8s("after: seq[next]", seq_, next_e_);
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
void SubsetSequence::DoCarries() {
|
|
130
|
+
CHECK(count_[k_] > 3); // We depend on count_[k_] being > 3 to stop while
|
|
131
|
+
// Make room by carrying
|
|
132
|
+
|
|
133
|
+
//DumpUint8s("DoCarries count[k]", count_, k_ + 1);
|
|
134
|
+
//DumpUint8s("DoCarries seq[next]", seq_, next_e_);
|
|
135
|
+
|
|
136
|
+
int i = 0;
|
|
137
|
+
while (count_[i] == 3) {
|
|
138
|
+
next_e_ -= 3;
|
|
139
|
+
seq_[next_e_] = Median3(next_e_);
|
|
140
|
+
++next_e_;
|
|
141
|
+
count_[i] = 0;
|
|
142
|
+
++count_[i + 1];
|
|
143
|
+
++i;
|
|
144
|
+
}
|
|
145
|
+
limit_e_ = next_e_ + 3;
|
|
146
|
+
|
|
147
|
+
//DumpUint8s("after: DoCarries count[k]", count_, k_ + 1);
|
|
148
|
+
//DumpUint8s("after: DoCarries seq[next]", seq_, next_e_);
|
|
149
|
+
|
|
150
|
+
// If we just fully carried into level K,
|
|
151
|
+
// Make sure there is now enough room, else start level K + 1
|
|
152
|
+
if (i >= k_) {
|
|
153
|
+
CHECK(count_[k_] == next_e_);
|
|
154
|
+
if (next_e_ >= level_limit_e_) {
|
|
155
|
+
NewLevel();
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
// Appends e as a new level-0 entry, then carries if the current limit has
// been reached.
void SubsetSequence::Add(uint8 e) {
  seq_[next_e_] = e;
  ++count_[0];
  ++next_e_;

  if (next_e_ < limit_e_) {
    return;
  }
  DoCarries();
}
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
// Collapse tail end by simple median across disparate-weight values,
|
|
173
|
+
// dropping or duplicating last value if need be.
|
|
174
|
+
// This routine is idempotent.
|
|
175
|
+
void SubsetSequence::Flush() {
|
|
176
|
+
// printf("Flush %d\n", count_[k_]);
|
|
177
|
+
int start_tail = count_[k_];
|
|
178
|
+
int size_tail = next_e_ - start_tail;
|
|
179
|
+
if ((size_tail % 3) == 2) {
|
|
180
|
+
seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value
|
|
181
|
+
++size_tail;
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
// Compress tail down by 3x, via median
|
|
185
|
+
int new_size = size_tail / 3; // May delete last value
|
|
186
|
+
for (int j = 0; j < new_size; ++j) {
|
|
187
|
+
seq_[start_tail + j] = Median3(start_tail + j * 3);
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
next_e_ = start_tail + new_size;
|
|
191
|
+
count_[k_] = next_e_;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
// Extract representative pattern of exactly N values into dst[0..n-1]
|
|
196
|
+
// This routine may be called multiple times, but it may downsample as a
|
|
197
|
+
// side effect, causing subsequent calls with larger N to get poor answers.
|
|
198
|
+
void SubsetSequence::Extract(int to_n, uint8* dst) {
|
|
199
|
+
// Collapse partial-carries in tail
|
|
200
|
+
Flush();
|
|
201
|
+
|
|
202
|
+
// Just use Bresenham to resample
|
|
203
|
+
int from_n = next_e_;
|
|
204
|
+
if (to_n >= from_n) {
|
|
205
|
+
// Up-sample from_n => to_n
|
|
206
|
+
int err = to_n - 1; // bias toward no overshoot
|
|
207
|
+
int j = 0;
|
|
208
|
+
for (int i = 0; i < to_n; ++i) {
|
|
209
|
+
dst[i] = seq_[j];
|
|
210
|
+
err -= from_n;
|
|
211
|
+
if (err < 0) {
|
|
212
|
+
++j;
|
|
213
|
+
err += to_n;
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
} else {
|
|
217
|
+
// Get to the point that the number of samples is <= 3 * to_n
|
|
218
|
+
while (next_e_ > (to_n * 3)) {
|
|
219
|
+
// Compress down by 3x, via median
|
|
220
|
+
// printf("Extract, median %d / 3\n", next_e_);
|
|
221
|
+
if ((next_e_ % 3) == 2) {
|
|
222
|
+
seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value
|
|
223
|
+
++next_e_;
|
|
224
|
+
}
|
|
225
|
+
int new_size = next_e_ / 3; // May delete last value
|
|
226
|
+
for (int j = 0; j < new_size; ++j) {
|
|
227
|
+
seq_[j] = Median3(j * 3);
|
|
228
|
+
}
|
|
229
|
+
next_e_ = new_size;
|
|
230
|
+
count_[k_] = next_e_;
|
|
231
|
+
}
|
|
232
|
+
from_n = next_e_;
|
|
233
|
+
|
|
234
|
+
if (to_n == from_n) {
|
|
235
|
+
// Copy verbatim
|
|
236
|
+
for (int i = 0; i < to_n; ++i) {
|
|
237
|
+
dst[i] = seq_[i];
|
|
238
|
+
}
|
|
239
|
+
return;
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
// Down-sample from_n => to_n, using medians
|
|
243
|
+
int err = 0; // Bias to immediate median sample
|
|
244
|
+
int j = 0;
|
|
245
|
+
for (int i = 0; i < from_n; ++i) {
|
|
246
|
+
err -= to_n;
|
|
247
|
+
if (err < 0) {
|
|
248
|
+
if (i <= (next_e_ - 2)) {
|
|
249
|
+
dst[j] = Median3(i);
|
|
250
|
+
} else {
|
|
251
|
+
dst[j] = seq_[i];
|
|
252
|
+
}
|
|
253
|
+
++j;
|
|
254
|
+
err += from_n;
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
}
|