cld 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +27 -0
- data/Manifest +106 -0
- data/README.rdoc +173 -0
- data/Rakefile +15 -0
- data/base/basictypes.h +348 -0
- data/base/build_config.h +115 -0
- data/base/casts.h +156 -0
- data/base/commandlineflags.h +443 -0
- data/base/crash.h +41 -0
- data/base/dynamic_annotations.h +358 -0
- data/base/global_strip_options.h +59 -0
- data/base/log_severity.h +46 -0
- data/base/logging.h +1403 -0
- data/base/macros.h +243 -0
- data/base/port.h +54 -0
- data/base/scoped_ptr.h +428 -0
- data/base/stl_decl.h +0 -0
- data/base/stl_decl_msvc.h +107 -0
- data/base/string_util.h +29 -0
- data/base/strtoint.h +93 -0
- data/base/template_util.h +96 -0
- data/base/type_traits.h +198 -0
- data/base/vlog_is_on.h +143 -0
- data/build.sh +48 -0
- data/build.win.cmd +28 -0
- data/cld.gemspec +30 -0
- data/cld_encodings.h +95 -0
- data/encodings/compact_lang_det/#cldutil.cc# +905 -0
- data/encodings/compact_lang_det/#cldutil.h# +1205 -0
- data/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
- data/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
- data/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
- data/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
- data/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
- data/encodings/compact_lang_det/#tote.cc# +299 -0
- data/encodings/compact_lang_det/#tote.h# +89 -0
- data/encodings/compact_lang_det/cldutil.cc +905 -0
- data/encodings/compact_lang_det/cldutil.h +1205 -0
- data/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/encodings/compact_lang_det/compile.cmd +1 -0
- data/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/encodings/compact_lang_det/tote.cc +299 -0
- data/encodings/compact_lang_det/tote.h +89 -0
- data/encodings/compact_lang_det/unittest_data.h +193 -0
- data/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
- data/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/encodings/internal/encodings.cc +12 -0
- data/encodings/lang_enc.h +254 -0
- data/encodings/proto/encodings.pb.h +169 -0
- data/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +7 -0
- data/languages/internal/#languages.cc# +337 -0
- data/languages/internal/languages.cc +337 -0
- data/languages/proto/languages.pb.h +179 -0
- data/languages/public/languages.h +379 -0
- data/lib/cld.rb +12 -0
- data/test/test.rb +570 -0
- data/thunk.cc +131 -0
- metadata +168 -0
@@ -0,0 +1,131 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|
7
|
+
|
8
|
+
#include "encodings/compact_lang_det/letterscript_enum.h"
|
9
|
+
#include "encodings/compact_lang_det/compact_lang_det_impl.h"
|
10
|
+
|
11
|
+
namespace getone {
|
12
|
+
static const int kMaxScriptBuffer = 4096;
|
13
|
+
static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
|
14
|
+
static const int kMaxScriptBytes = kMaxScriptBuffer- 8; // Leave some room
|
15
|
+
static const int kMaxAnswerBuffer = 256;
|
16
|
+
|
17
|
+
typedef enum UnicodeLScript ULScript;
|
18
|
+
|
19
|
+
typedef struct {
|
20
|
+
char* text; // Pointer to the span, somewhere
|
21
|
+
int text_bytes; // Number of bytes of text in the span
|
22
|
+
int offset; // Offset of start of span in original input buffer
|
23
|
+
ULScript script; // Script of all the letters in this span
|
24
|
+
Language lang; // Language identified for this span
|
25
|
+
bool truncated; // true if buffer filled up before a
|
26
|
+
// different script or EOF was found
|
27
|
+
} LangSpan;
|
28
|
+
|
29
|
+
|
30
|
+
static inline bool IsContinuationByte(char c) {
|
31
|
+
return static_cast<signed char>(c) < -64;
|
32
|
+
}
|
33
|
+
|
34
|
+
// Gets lscript number for letters; always returns
|
35
|
+
// 0 (common script) for non-letters
|
36
|
+
int GetUTF8LetterScriptNum(const char* src);
|
37
|
+
|
38
|
+
|
39
|
+
// Update src pointer to point to next quadgram, +2..+5
|
40
|
+
// Looks at src[0..4]
|
41
|
+
const char* AdvanceQuad(const char* src);
|
42
|
+
} // end namespace getone
|
43
|
+
|
44
|
+
|
45
|
+
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
class ScriptScanner {
|
50
|
+
public:
|
51
|
+
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
|
52
|
+
~ScriptScanner();
|
53
|
+
|
54
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
55
|
+
bool GetOneScriptSpan(getone::LangSpan* span);
|
56
|
+
|
57
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
58
|
+
void LowerScriptSpan(getone::LangSpan* span);
|
59
|
+
|
60
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
61
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
62
|
+
bool GetOneScriptSpanLower(getone::LangSpan* span);
|
63
|
+
|
64
|
+
private:
|
65
|
+
int SkipToFrontOfSpan(const char* src, int len, int* script);
|
66
|
+
|
67
|
+
const char* start_byte_;
|
68
|
+
const char* next_byte_;
|
69
|
+
const char* next_byte_limit_;
|
70
|
+
int byte_length_;
|
71
|
+
bool is_plain_text_;
|
72
|
+
char* script_buffer_; // Holds text with expanded entities
|
73
|
+
char* script_buffer_lower_; // Holds lowercased text
|
74
|
+
};
|
75
|
+
|
76
|
+
|
77
|
+
class LangScanner {
|
78
|
+
public:
|
79
|
+
LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
|
80
|
+
getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
|
81
|
+
int maxlangs, int minlangspan);
|
82
|
+
~LangScanner();
|
83
|
+
|
84
|
+
|
85
|
+
int script() {return script_;}
|
86
|
+
|
87
|
+
// Use new text
|
88
|
+
// Keep smoothing state if same script, otherwise reinit smoothing
|
89
|
+
void NewText(getone::LangSpan* spn);
|
90
|
+
|
91
|
+
bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
|
92
|
+
bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
|
93
|
+
|
94
|
+
// The real ones
|
95
|
+
bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
|
96
|
+
getone::LangSpan* span);
|
97
|
+
bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
|
98
|
+
getone::LangSpan* span);
|
99
|
+
|
100
|
+
// Increases language bias by delta
|
101
|
+
void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
|
102
|
+
Language key, int delta);
|
103
|
+
|
104
|
+
// For debugging output
|
105
|
+
int next_answer_;
|
106
|
+
char answer_buffer_[getone::kMaxAnswerBuffer];
|
107
|
+
char answer_buffer2_[getone::kMaxAnswerBuffer];
|
108
|
+
char answer_buffer3_[getone::kMaxAnswerBuffer];
|
109
|
+
char answer_buffer4_[getone::kMaxAnswerBuffer];
|
110
|
+
|
111
|
+
private:
|
112
|
+
const char* start_byte_;
|
113
|
+
const char* next_byte_limit_;
|
114
|
+
const char* next_byte_;
|
115
|
+
const char* onelangspan_begin_;
|
116
|
+
int byte_length_;
|
117
|
+
int script_;
|
118
|
+
Language spanlang_;
|
119
|
+
int smoothwidth_;
|
120
|
+
int smoothwidth_2_;
|
121
|
+
int smoothcandidates_;
|
122
|
+
int maxlangs_;
|
123
|
+
int minlangspan_;
|
124
|
+
int rb_size_;
|
125
|
+
int next_rb_;
|
126
|
+
int rb_mask_;
|
127
|
+
uint32* rb_;
|
128
|
+
int* offset_rb_;
|
129
|
+
};
|
130
|
+
|
131
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|
@@ -0,0 +1,299 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#include "encodings/compact_lang_det/tote.h"
|
6
|
+
#include <string.h> // memset
|
7
|
+
|
8
|
+
#include "encodings/compact_lang_det/win/cld_logging.h"
|
9
|
+
|
10
|
+
|
11
|
+
// Take a set of <key, value> pairs and tote them up.
|
12
|
+
// After explicitly sorting, retrieve top key, value pairs
|
13
|
+
// Builds an empty tote: all counters zero and every slot unassigned (key 0).
Tote::Tote()
    : gram_count_(0),
      incr_count_(0),
      byte_count_(0) {
  // value_ is left uninitialized on purpose: a slot's value is only read
  // after Add() has assigned it, or after Sort() has forced it to -1.
  memset(key_, 0, sizeof key_);
}
|
20
|
+
|
21
|
+
// Nothing to release: every member is a plain value or fixed-size array.
Tote::~Tote() {}
|
23
|
+
|
24
|
+
// Returns the tote to its freshly-constructed state so it can be reused.
void Tote::Reinit() {
  // Marking every slot unassigned is enough; stale value_ entries are
  // overwritten when a slot is next allocated by Add().
  memset(key_, 0, sizeof key_);
  byte_count_ = 0;
  incr_count_ = 0;
  gram_count_ = 0;
}
|
31
|
+
|
32
|
+
// Increment count of quadgrams/trigrams/unigrams scored
|
33
|
+
// Counts one more scored quadgram/trigram/unigram.
void Tote::AddGram() {
  gram_count_ += 1;
}
|
36
|
+
|
37
|
+
// Three-way associative, guaranteeing that the largest two counts are always
|
38
|
+
// in the data structure. kMaxSize must be a multiple of 3, and is tied to the
|
39
|
+
// subscript calculations here, which are for 8 sets of 3-way associative
|
40
|
+
// buckets. The subscripts for set N are [N], [N+8], and [N+16] used in a
|
41
|
+
// slightly-weird way: The initial probe point is [N] or [N+8], whichever
|
42
|
+
// is specified by key mod 16. In most cases (nearly *all* cases except Latin
|
43
|
+
// script), this entry matches and we update/return. The second probe is
|
44
|
+
// the other of [N] and [N+8]. The third probe is only used as a fallback to
|
45
|
+
// these two, and is there only for the rare case that there are three or more
|
46
|
+
// languages with Language enum values equal mod 8, contending within the same
|
47
|
+
// bucket. This can only happen in Latin and (rarely) Cyrillic scripts, because
|
48
|
+
// the other scripts have fewer than 17 languages total.
|
49
|
+
// If you change kMaxSize, change the constants 7/8/15/16 below
|
50
|
+
// Adds idelta to the running total kept for ikey.
//
// ikey must be non-zero (0 marks an unassigned slot). Each key may live in
// one of three slots, tried in this order:
//   [ikey & 15], [(ikey & 15) ^ 8], [(ikey & 7) + 16]
// If the key is in none of them it takes the first unassigned one; if all
// three are taken, the slot with the smallest value is evicted (ties keep
// the earlier probe) and restarted at idelta.
void Tote::Add(uint8 ikey, int idelta) {
  DCHECK(ikey != 0);
  ++incr_count_;

  const int kProbeCount = 3;
  const int home = ikey & 15;
  const int probes[kProbeCount] = {home, home ^ 8, (ikey & 7) + 16};

  // Existing entry: accumulate and finish.
  for (int p = 0; p < kProbeCount; ++p) {
    const int slot = probes[p];
    if (key_[slot] == ikey) {
      value_[slot] += idelta;
      return;
    }
  }

  // New entry: prefer the first unassigned probe slot.
  int victim = -1;
  for (int p = 0; p < kProbeCount; ++p) {
    if (key_[probes[p]] == 0) {
      victim = probes[p];
      break;
    }
  }

  // All choices allocated, need to replace smallest one
  if (victim < 0) {
    victim = probes[0];
    for (int p = 1; p < kProbeCount; ++p) {
      if (value_[probes[p]] < value_[victim]) {
        victim = probes[p];
      }
    }
  }

  key_[victim] = ikey;
  value_[victim] = idelta;
}
|
89
|
+
|
90
|
+
// Return current top key
|
91
|
+
int Tote::CurrentTopKey() {
|
92
|
+
int top_key = 0;
|
93
|
+
int top_value = -1;
|
94
|
+
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
95
|
+
if (key_[sub] == 0) {continue;}
|
96
|
+
if (top_value < value_[sub]) {
|
97
|
+
top_value = value_[sub];
|
98
|
+
top_key = key_[sub];
|
99
|
+
}
|
100
|
+
}
|
101
|
+
return top_key;
|
102
|
+
}
|
103
|
+
|
104
|
+
|
105
|
+
// Sort first n entries by decreasing order of value
|
106
|
+
// If key==0 other fields are not valid, treat value as -1
|
107
|
+
void Tote::Sort(int n) {
|
108
|
+
// This is n**2, but n is small
|
109
|
+
for (int sub = 0; sub < n; ++sub) {
|
110
|
+
if (key_[sub] == 0) {value_[sub] = -1;}
|
111
|
+
|
112
|
+
// Bubble sort key[sub] and entry[sub]
|
113
|
+
for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
|
114
|
+
if (key_[sub2] == 0) {value_[sub2] = -1;}
|
115
|
+
if (value_[sub] < value_[sub2]) {
|
116
|
+
// swap
|
117
|
+
uint8 tmpk = key_[sub];
|
118
|
+
key_[sub] = key_[sub2];
|
119
|
+
key_[sub2] = tmpk;
|
120
|
+
int tmpv = value_[sub];
|
121
|
+
value_[sub] = value_[sub2];
|
122
|
+
value_[sub2] = tmpv;
|
123
|
+
}
|
124
|
+
}
|
125
|
+
}
|
126
|
+
}
|
127
|
+
|
128
|
+
void Tote::Dump(FILE* f) {
|
129
|
+
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
130
|
+
if (key_[sub] > 0) {
|
131
|
+
fprintf(f, "[%2d] %3d %8d\n", sub, key_[sub], value_[sub]);
|
132
|
+
}
|
133
|
+
}
|
134
|
+
fprintf(f, "%d %d %d\n", gram_count_, incr_count_, byte_count_);
|
135
|
+
}
|
136
|
+
|
137
|
+
|
138
|
+
|
139
|
+
|
140
|
+
// Take a set of <key, value> pairs and tote them up.
|
141
|
+
// After explicitly sorting, retrieve top key, value pairs
|
142
|
+
// Builds an empty, unsorted tote: no Add calls counted, all close-pair
// counters zero and every slot unassigned (key 0).
ToteWithReliability::ToteWithReliability()
    : incr_count_(0),
      sorted_(0) {
  // value_, score_ and reliability_ are deliberately left uninitialized;
  // Add() assigns all three whenever it allocates a slot.
  memset(closepair_, 0, sizeof closepair_);
  memset(key_, 0, sizeof key_);
}
|
149
|
+
|
150
|
+
// Nothing to release: every member is a plain value or fixed-size array.
ToteWithReliability::~ToteWithReliability() {}
|
152
|
+
|
153
|
+
// Returns the tote to its freshly-constructed state. This also clears the
// sorted flag, so Add() may be used again.
void ToteWithReliability::Reinit() {
  // value_, score_ and reliability_ need no reset; Add() assigns them
  // whenever it allocates a slot.
  memset(key_, 0, sizeof key_);
  memset(closepair_, 0, sizeof closepair_);
  sorted_ = 0;
  incr_count_ = 0;
  ////ss_.Init();
}
|
161
|
+
|
162
|
+
// Weight reliability by ibytes
|
163
|
+
// Also see three-way associative comments above for Tote
|
164
|
+
// Accumulates one observation for ikey: ibytes is added to the byte count,
// score to the score sum, and ireliability weighted by ibytes to the
// reliability sum.
//
// ikey must be non-zero, and the tote must not have been sorted yet
// (sorting moves entries away from their probe slots). Slot selection is
// the same three-way associative scheme as Tote::Add: probe
//   [ikey & 15], [(ikey & 15) ^ 8], [(ikey & 7) + 16]
// then take the first unassigned one, else evict the smallest value_.
void ToteWithReliability::Add(uint8 ikey, int ibytes,
                              int score, int ireliability) {
  DCHECK(ikey != 0);
  CHECK(sorted_ == 0);
  ++incr_count_;

  // Weight reliability by ibytes
  const int weighted_reliability = ireliability * ibytes;

  const int kProbeCount = 3;
  const int home = ikey & 15;
  const int probes[kProbeCount] = {home, home ^ 8, (ikey & 7) + 16};

  // Existing entry: accumulate and finish.
  for (int p = 0; p < kProbeCount; ++p) {
    const int slot = probes[p];
    if (key_[slot] == ikey) {
      value_[slot] += ibytes;
      score_[slot] += score;
      reliability_[slot] += weighted_reliability;
      return;
    }
  }

  // New entry: prefer the first unassigned probe slot.
  int victim = -1;
  for (int p = 0; p < kProbeCount; ++p) {
    if (key_[probes[p]] == 0) {
      victim = probes[p];
      break;
    }
  }

  // All choices allocated, need to replace smallest one
  if (victim < 0) {
    victim = probes[0];
    for (int p = 1; p < kProbeCount; ++p) {
      if (value_[probes[p]] < value_[victim]) {
        victim = probes[p];
      }
    }
  }

  key_[victim] = ikey;
  value_[victim] = ibytes;
  score_[victim] = score;
  reliability_[victim] = weighted_reliability;
}
|
213
|
+
|
214
|
+
// Find subscript of a given packed language, or -1
|
215
|
+
// Find subscript of a given packed language, or -1 if it is not present.
// Before sorting only the key's three probe slots need checking; after
// sorting entries may be anywhere, so every slot is scanned.
int ToteWithReliability::Find(uint8 ikey) {
  DCHECK(ikey != 0);

  if (!sorted_) {
    // Same probe order as Add().
    const int home = ikey & 15;
    if (key_[home] == ikey) {return home;}

    const int alternate = home ^ 8;
    if (key_[alternate] == ikey) {return alternate;}

    const int overflow = (ikey & 7) + 16;
    if (key_[overflow] == ikey) {return overflow;}

    return -1;
  }

  // Linear search if sorted
  for (int slot = 0; slot < kMaxSize_; ++slot) {
    if (key_[slot] == ikey) {return slot;}
  }
  return -1;
}
|
242
|
+
|
243
|
+
// Return current top key
|
244
|
+
int ToteWithReliability::CurrentTopKey() {
|
245
|
+
int top_key = 0;
|
246
|
+
int top_value = -1;
|
247
|
+
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
248
|
+
if (key_[sub] == 0) {continue;}
|
249
|
+
if (top_value < value_[sub]) {
|
250
|
+
top_value = value_[sub];
|
251
|
+
top_key = key_[sub];
|
252
|
+
}
|
253
|
+
}
|
254
|
+
return top_key;
|
255
|
+
}
|
256
|
+
|
257
|
+
|
258
|
+
// Sort first n entries by decreasing order of value
|
259
|
+
// If key==0 other fields are not valid, treat value as -1
|
260
|
+
void ToteWithReliability::Sort(int n) {
|
261
|
+
// This is n**2, but n is small
|
262
|
+
for (int sub = 0; sub < n; ++sub) {
|
263
|
+
if (key_[sub] == 0) {value_[sub] = -1;}
|
264
|
+
|
265
|
+
// Bubble sort key[sub] and entry[sub]
|
266
|
+
for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
|
267
|
+
if (key_[sub2] == 0) {value_[sub2] = -1;}
|
268
|
+
if (value_[sub] < value_[sub2]) {
|
269
|
+
// swap
|
270
|
+
uint8 tmpk = key_[sub];
|
271
|
+
key_[sub] = key_[sub2];
|
272
|
+
key_[sub2] = tmpk;
|
273
|
+
|
274
|
+
int tmpv = value_[sub];
|
275
|
+
value_[sub] = value_[sub2];
|
276
|
+
value_[sub2] = tmpv;
|
277
|
+
|
278
|
+
double tmps = score_[sub];
|
279
|
+
score_[sub] = score_[sub2];
|
280
|
+
score_[sub2] = tmps;
|
281
|
+
|
282
|
+
int tmpr = reliability_[sub];
|
283
|
+
reliability_[sub] = reliability_[sub2];
|
284
|
+
reliability_[sub2] = tmpr;
|
285
|
+
}
|
286
|
+
}
|
287
|
+
}
|
288
|
+
sorted_ = 1;
|
289
|
+
}
|
290
|
+
|
291
|
+
void ToteWithReliability::Dump(FILE* f) {
|
292
|
+
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
293
|
+
if (key_[sub] > 0) {
|
294
|
+
fprintf(f, "[%2d] %3d %6d %5d %4d\n",
|
295
|
+
sub, key_[sub], value_[sub], score_[sub], reliability_[sub]);
|
296
|
+
}
|
297
|
+
}
|
298
|
+
fprintf(f, " %d#\n", incr_count_);
|
299
|
+
}
|
@@ -0,0 +1,89 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_TOTE_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_TOTE_H_
|
7
|
+
|
8
|
+
#include <stdio.h>
|
9
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
10
|
+
|
11
|
+
// Take a set of <key, value> pairs and tote them up.
|
12
|
+
// After explicitly sorting, retrieve top key, value pairs
|
13
|
+
class Tote {
|
14
|
+
public:
|
15
|
+
Tote();
|
16
|
+
~Tote();
|
17
|
+
void Reinit();
|
18
|
+
void AddGram();
|
19
|
+
void Add(uint8 ikey, int idelta);
|
20
|
+
void AddBytes(int ibytes) {byte_count_ += ibytes;}
|
21
|
+
int CurrentTopKey();
|
22
|
+
void Sort(int n);
|
23
|
+
void Dump(FILE* f);
|
24
|
+
uint16 GetGramCount() const {return gram_count_;}
|
25
|
+
uint16 GetIncrCount() const {return incr_count_;}
|
26
|
+
int GetByteCount() const {return byte_count_;}
|
27
|
+
int MaxSize() const {return kMaxSize_;}
|
28
|
+
uint8 Key(int i) const {return key_[i];}
|
29
|
+
int Value(int i) const {return value_[i];}
|
30
|
+
void SetGramCount(uint16 v) {gram_count_ = v;}
|
31
|
+
void SetIncrCount(uint16 v) {incr_count_ = v;}
|
32
|
+
void SetKey(int i, int v) {key_[i] = v;}
|
33
|
+
void SetValue(int i, int v) {value_[i] = v;}
|
34
|
+
|
35
|
+
private:
|
36
|
+
static const int kMaxSize_ = 24;
|
37
|
+
uint16 gram_count_; // Number of quadgrams/etc. scored
|
38
|
+
uint16 incr_count_; // Number of Add calls (1-3 per gram)
|
39
|
+
int byte_count_; // Bytes of text scored
|
40
|
+
// Align at multiple of 8 bytes
|
41
|
+
uint8 key_[kMaxSize_]; // Lang unassigned = 0, valid = 1..255
|
42
|
+
int value_[kMaxSize_]; // Probability score sum
|
43
|
+
};
|
44
|
+
|
45
|
+
|
46
|
+
// Take a set of <key, value, reliability> triples and tote them up.
|
47
|
+
// After explicitly sorting, retrieve top key, value, reliability triples
|
48
|
+
class ToteWithReliability {
|
49
|
+
public:
|
50
|
+
ToteWithReliability();
|
51
|
+
~ToteWithReliability();
|
52
|
+
void Reinit();
|
53
|
+
void Add(uint8 ikey, int ibytes, int score, int ireliability);
|
54
|
+
int Find(uint8 ikey);
|
55
|
+
void AddClosePair(int subscr, int val) {closepair_[subscr] += val;}
|
56
|
+
int CurrentTopKey();
|
57
|
+
void Sort(int n);
|
58
|
+
void Dump(FILE* f);
|
59
|
+
|
60
|
+
////void AddSeq(uint8 ikey) {ss_.Add(ikey);}
|
61
|
+
////void ExtractSeq(int n, uint8* dst) {ss_.Extract(n, dst);}
|
62
|
+
|
63
|
+
int GetIncrCount() const {return incr_count_;}
|
64
|
+
int GetClosePair(int subscr) const {return closepair_[subscr];}
|
65
|
+
int MaxSize() const {return kMaxSize_;}
|
66
|
+
uint8 Key(int i) const {return key_[i];}
|
67
|
+
int Value(int i) const {return value_[i];}
|
68
|
+
int Score(int i) const {return score_[i];}
|
69
|
+
int Reliability(int i) const {return reliability_[i];}
|
70
|
+
void SetKey(int i, int v) {key_[i] = v;}
|
71
|
+
void SetValue(int i, int v) {value_[i] = v;}
|
72
|
+
void SetScore(int i, int v) {score_[i] = v;}
|
73
|
+
void SetReliability(int i, int v) {reliability_[i] = v;}
|
74
|
+
|
75
|
+
private:
|
76
|
+
static const int kMaxSize_ = 24;
|
77
|
+
static const int kMaxClosePairSize_ = 8;
|
78
|
+
int incr_count_; // Number of Add calls
|
79
|
+
int sorted_; // Contents have been sorted, cannot Add
|
80
|
+
// Align at multiple of 8 bytes
|
81
|
+
int closepair_[kMaxClosePairSize_];
|
82
|
+
uint8 key_[kMaxSize_]; // Lang unassigned = 0, valid = 1..255
|
83
|
+
int value_[kMaxSize_]; // Bytecount this lang
|
84
|
+
int score_[kMaxSize_]; // Probability score sum
|
85
|
+
int reliability_[kMaxSize_]; // Percentage 0..100
|
86
|
+
////SubsetSequence ss_;
|
87
|
+
};
|
88
|
+
|
89
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_TOTE_H_
|