cld-fixed 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/.rspec +2 -0
- data/Gemfile +6 -0
- data/LICENSE +27 -0
- data/README.md +34 -0
- data/Rakefile +5 -0
- data/cld.gemspec +22 -0
- data/ext/cld/Makefile.am +28 -0
- data/ext/cld/Makefile.in +790 -0
- data/ext/cld/aclocal.m4 +8895 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +115 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/build_aux/config.guess +1500 -0
- data/ext/cld/build_aux/config.sub +1616 -0
- data/ext/cld/build_aux/depcomp +584 -0
- data/ext/cld/build_aux/install-sh +507 -0
- data/ext/cld/build_aux/ltmain.sh +8745 -0
- data/ext/cld/build_aux/missing +367 -0
- data/ext/cld/cld_encodings.h +95 -0
- data/ext/cld/configure +17362 -0
- data/ext/cld/configure.ac +14 -0
- data/ext/cld/encodings/compact_lang_det/#cldutil.cc# +905 -0
- data/ext/cld/encodings/compact_lang_det/#cldutil.h# +1205 -0
- data/ext/cld/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
- data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
- data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
- data/ext/cld/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
- data/ext/cld/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
- data/ext/cld/encodings/compact_lang_det/#tote.cc# +299 -0
- data/ext/cld/encodings/compact_lang_det/#tote.h# +89 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +7 -0
- data/ext/cld/languages/internal/#languages.cc# +337 -0
- data/ext/cld/languages/internal/languages.cc +336 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/ext/cld/thunk.cc +55 -0
- data/lib/cld.rb +21 -0
- data/lib/cld/version.rb +3 -0
- data/spec/cld_spec.rb +67 -0
- data/spec/spec_helper.rb +6 -0
- metadata +193 -0
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|
|
7
|
+
|
|
8
|
+
#include "encodings/compact_lang_det/letterscript_enum.h"
|
|
9
|
+
#include "encodings/compact_lang_det/compact_lang_det_impl.h"
|
|
10
|
+
|
|
11
|
+
namespace getone {
|
|
12
|
+
static const int kMaxScriptBuffer = 4096;
|
|
13
|
+
static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
|
|
14
|
+
static const int kMaxScriptBytes = kMaxScriptBuffer- 8; // Leave some room
|
|
15
|
+
static const int kMaxAnswerBuffer = 256;
|
|
16
|
+
|
|
17
|
+
typedef enum UnicodeLScript ULScript;
|
|
18
|
+
|
|
19
|
+
typedef struct {
|
|
20
|
+
char* text; // Pointer to the span, somewhere
|
|
21
|
+
int text_bytes; // Number of bytes of text in the span
|
|
22
|
+
int offset; // Offset of start of span in original input buffer
|
|
23
|
+
ULScript script; // Script of all the letters in this span
|
|
24
|
+
Language lang; // Language identified for this span
|
|
25
|
+
bool truncated; // true if buffer filled up before a
|
|
26
|
+
// different script or EOF was found
|
|
27
|
+
} LangSpan;
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
static inline bool IsContinuationByte(char c) {
|
|
31
|
+
return static_cast<signed char>(c) < -64;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
// Gets lscript number for letters; always returns
|
|
35
|
+
// 0 (common script) for non-letters
|
|
36
|
+
int GetUTF8LetterScriptNum(const char* src);
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
// Update src pointer to point to next quadgram, +2..+5
|
|
40
|
+
// Looks at src[0..4]
|
|
41
|
+
const char* AdvanceQuad(const char* src);
|
|
42
|
+
} // end namespace getone
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class ScriptScanner {
|
|
50
|
+
public:
|
|
51
|
+
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
|
|
52
|
+
~ScriptScanner();
|
|
53
|
+
|
|
54
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
|
55
|
+
bool GetOneScriptSpan(getone::LangSpan* span);
|
|
56
|
+
|
|
57
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
|
58
|
+
void LowerScriptSpan(getone::LangSpan* span);
|
|
59
|
+
|
|
60
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
|
61
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
|
62
|
+
bool GetOneScriptSpanLower(getone::LangSpan* span);
|
|
63
|
+
|
|
64
|
+
private:
|
|
65
|
+
int SkipToFrontOfSpan(const char* src, int len, int* script);
|
|
66
|
+
|
|
67
|
+
const char* start_byte_;
|
|
68
|
+
const char* next_byte_;
|
|
69
|
+
const char* next_byte_limit_;
|
|
70
|
+
int byte_length_;
|
|
71
|
+
bool is_plain_text_;
|
|
72
|
+
char* script_buffer_; // Holds text with expanded entities
|
|
73
|
+
char* script_buffer_lower_; // Holds lowercased text
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class LangScanner {
|
|
78
|
+
public:
|
|
79
|
+
LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
|
|
80
|
+
getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
|
|
81
|
+
int maxlangs, int minlangspan);
|
|
82
|
+
~LangScanner();
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
int script() {return script_;}
|
|
86
|
+
|
|
87
|
+
// Use new text
|
|
88
|
+
// Keep smoothing state if same script, otherwise reinit smoothing
|
|
89
|
+
void NewText(getone::LangSpan* spn);
|
|
90
|
+
|
|
91
|
+
bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
|
|
92
|
+
bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
|
|
93
|
+
|
|
94
|
+
// The real ones
|
|
95
|
+
bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
|
|
96
|
+
getone::LangSpan* span);
|
|
97
|
+
bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
|
|
98
|
+
getone::LangSpan* span);
|
|
99
|
+
|
|
100
|
+
// Increases language bias by delta
|
|
101
|
+
void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
|
|
102
|
+
Language key, int delta);
|
|
103
|
+
|
|
104
|
+
// For debugging output
|
|
105
|
+
int next_answer_;
|
|
106
|
+
char answer_buffer_[getone::kMaxAnswerBuffer];
|
|
107
|
+
char answer_buffer2_[getone::kMaxAnswerBuffer];
|
|
108
|
+
char answer_buffer3_[getone::kMaxAnswerBuffer];
|
|
109
|
+
char answer_buffer4_[getone::kMaxAnswerBuffer];
|
|
110
|
+
|
|
111
|
+
private:
|
|
112
|
+
const char* start_byte_;
|
|
113
|
+
const char* next_byte_limit_;
|
|
114
|
+
const char* next_byte_;
|
|
115
|
+
const char* onelangspan_begin_;
|
|
116
|
+
int byte_length_;
|
|
117
|
+
int script_;
|
|
118
|
+
Language spanlang_;
|
|
119
|
+
int smoothwidth_;
|
|
120
|
+
int smoothwidth_2_;
|
|
121
|
+
int smoothcandidates_;
|
|
122
|
+
int maxlangs_;
|
|
123
|
+
int minlangspan_;
|
|
124
|
+
int rb_size_;
|
|
125
|
+
int next_rb_;
|
|
126
|
+
int rb_mask_;
|
|
127
|
+
uint32* rb_;
|
|
128
|
+
int* offset_rb_;
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/tote.h"
|
|
6
|
+
#include <string.h> // memset
|
|
7
|
+
|
|
8
|
+
#include "encodings/compact_lang_det/win/cld_logging.h"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
// Take a set of <key, value> pairs and tote them up.
|
|
12
|
+
// After explicitly sorting, retrieve top key, value pairs
|
|
13
|
+
Tote::Tote() {
|
|
14
|
+
gram_count_ = 0;
|
|
15
|
+
incr_count_ = 0;
|
|
16
|
+
byte_count_ = 0;
|
|
17
|
+
memset(key_, 0, sizeof(key_));
|
|
18
|
+
// No need to initialize values
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
// Destructor: intentionally empty, the body performs no cleanup.
Tote::~Tote() {}
|
|
23
|
+
|
|
24
|
+
// Returns the tote to its freshly-constructed state: every key slot is
// unassigned and all counters are zero.
void Tote::Reinit() {
  // Clearing the keys is enough; values are ignored while a key is 0.
  memset(key_, 0, sizeof(key_));
  gram_count_ = incr_count_ = 0;
  byte_count_ = 0;
}
|
|
31
|
+
|
|
32
|
+
// Increment count of quadgrams/trigrams/unigrams scored
|
|
33
|
+
void Tote::AddGram() {
|
|
34
|
+
++gram_count_;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
// Three-way associative, guaranteeing that the largest two counts are always
|
|
38
|
+
// in the data structure. kMaxSize must be a multiple of 3, and is tied to the
|
|
39
|
+
// subscript calculations here, which are for 8 sets of 3-way associative
|
|
40
|
+
// buckets. The subscripts for set N are [N], [N+8], and [N+16] used in a
|
|
41
|
+
// slightly-weird way: The initial probe point is [N] or [N+8], whichever
|
|
42
|
+
// is specified by key mod 16. In most cases (nearly *all* cases except Latin
|
|
43
|
+
// script), this entry matches and we update/return. The second probe is
|
|
44
|
+
// the other of [N] and [N+8]. The third probe is only used as a fallback to
|
|
45
|
+
// these two, and is there only for the rare case that there are three or more
|
|
46
|
+
// languages with Language enum values equal mod 8, contending within the same
|
|
47
|
+
// bucket. This can only happen in Latin and (rarely) Cyrillic scripts, because
|
|
48
|
+
// the other scripts have fewer than 17 languages total.
|
|
49
|
+
// If you change kMaxSize, change the constants 7/8/15/16 below
|
|
50
|
+
void Tote::Add(uint8 ikey, int idelta) {
|
|
51
|
+
DCHECK(ikey != 0);
|
|
52
|
+
++incr_count_;
|
|
53
|
+
|
|
54
|
+
// Look for existing entry
|
|
55
|
+
int sub0 = ikey & 15;
|
|
56
|
+
if (key_[sub0] == ikey) {
|
|
57
|
+
value_[sub0] += idelta;
|
|
58
|
+
return;
|
|
59
|
+
}
|
|
60
|
+
int sub1 = sub0 ^ 8;
|
|
61
|
+
if (key_[sub1] == ikey) {
|
|
62
|
+
value_[sub1] += idelta;
|
|
63
|
+
return;
|
|
64
|
+
}
|
|
65
|
+
int sub2 = (ikey & 7) + 16;
|
|
66
|
+
if (key_[sub2] == ikey) {
|
|
67
|
+
value_[sub2] += idelta;
|
|
68
|
+
return;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
// Allocate new entry
|
|
72
|
+
int alloc = -1;
|
|
73
|
+
if (key_[sub0] == 0) {
|
|
74
|
+
alloc = sub0;
|
|
75
|
+
} else if (key_[sub1] == 0) {
|
|
76
|
+
alloc = sub1;
|
|
77
|
+
} else if (key_[sub2] == 0) {
|
|
78
|
+
alloc = sub2;
|
|
79
|
+
} else {
|
|
80
|
+
// All choices allocated, need to replace smallest one
|
|
81
|
+
alloc = sub0;
|
|
82
|
+
if (value_[sub1] < value_[alloc]) {alloc = sub1;}
|
|
83
|
+
if (value_[sub2] < value_[alloc]) {alloc = sub2;}
|
|
84
|
+
}
|
|
85
|
+
key_[alloc] = ikey;
|
|
86
|
+
value_[alloc] = idelta;
|
|
87
|
+
return;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
// Return current top key
|
|
91
|
+
int Tote::CurrentTopKey() {
|
|
92
|
+
int top_key = 0;
|
|
93
|
+
int top_value = -1;
|
|
94
|
+
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
|
95
|
+
if (key_[sub] == 0) {continue;}
|
|
96
|
+
if (top_value < value_[sub]) {
|
|
97
|
+
top_value = value_[sub];
|
|
98
|
+
top_key = key_[sub];
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
return top_key;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
// Sort first n entries by decreasing order of value
|
|
106
|
+
// If key==0 other fields are not valid, treat value as -1
|
|
107
|
+
void Tote::Sort(int n) {
|
|
108
|
+
// This is n**2, but n is small
|
|
109
|
+
for (int sub = 0; sub < n; ++sub) {
|
|
110
|
+
if (key_[sub] == 0) {value_[sub] = -1;}
|
|
111
|
+
|
|
112
|
+
// Bubble sort key[sub] and entry[sub]
|
|
113
|
+
for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
|
|
114
|
+
if (key_[sub2] == 0) {value_[sub2] = -1;}
|
|
115
|
+
if (value_[sub] < value_[sub2]) {
|
|
116
|
+
// swap
|
|
117
|
+
uint8 tmpk = key_[sub];
|
|
118
|
+
key_[sub] = key_[sub2];
|
|
119
|
+
key_[sub2] = tmpk;
|
|
120
|
+
int tmpv = value_[sub];
|
|
121
|
+
value_[sub] = value_[sub2];
|
|
122
|
+
value_[sub2] = tmpv;
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
void Tote::Dump(FILE* f) {
|
|
129
|
+
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
|
130
|
+
if (key_[sub] > 0) {
|
|
131
|
+
fprintf(f, "[%2d] %3d %8d\n", sub, key_[sub], value_[sub]);
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
fprintf(f, "%d %d %d\n", gram_count_, incr_count_, byte_count_);
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
// Take a set of <key, value> pairs and tote them up.
|
|
141
|
+
// After explicitly sorting, retrieve top key, value pairs
|
|
142
|
+
ToteWithReliability::ToteWithReliability() {
|
|
143
|
+
// No need to initialize score_ or value_
|
|
144
|
+
incr_count_ = 0;
|
|
145
|
+
sorted_ = 0;
|
|
146
|
+
memset(closepair_, 0, sizeof(closepair_));
|
|
147
|
+
memset(key_, 0, sizeof(key_));
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
// Destructor: intentionally empty, the body performs no cleanup.
ToteWithReliability::~ToteWithReliability() {}
|
|
152
|
+
|
|
153
|
+
// Returns the tote to its freshly-constructed state: unsorted, no Add calls
// counted, close-pair counters zeroed and every key slot unassigned.
void ToteWithReliability::Reinit() {
  memset(key_, 0, sizeof(key_));
  memset(closepair_, 0, sizeof(closepair_));
  sorted_ = 0;
  incr_count_ = 0;
  // value_, score_ and reliability_ are deliberately not cleared.
  ////ss_.Init();
}
|
|
161
|
+
|
|
162
|
+
// Weight reliability by ibytes
|
|
163
|
+
// Also see three-way associative comments above for Tote
|
|
164
|
+
void ToteWithReliability::Add(uint8 ikey, int ibytes,
|
|
165
|
+
int score, int ireliability) {
|
|
166
|
+
DCHECK(ikey != 0);
|
|
167
|
+
CHECK(sorted_ == 0);
|
|
168
|
+
++incr_count_;
|
|
169
|
+
|
|
170
|
+
// Look for existing entry
|
|
171
|
+
int sub0 = ikey & 15;
|
|
172
|
+
if (key_[sub0] == ikey) {
|
|
173
|
+
value_[sub0] += ibytes;
|
|
174
|
+
score_[sub0] += score;
|
|
175
|
+
reliability_[sub0] += ireliability * ibytes;
|
|
176
|
+
return;
|
|
177
|
+
}
|
|
178
|
+
int sub1 = sub0 ^ 8;
|
|
179
|
+
if (key_[sub1] == ikey) {
|
|
180
|
+
value_[sub1] += ibytes;
|
|
181
|
+
score_[sub1] += score;
|
|
182
|
+
reliability_[sub1] += ireliability * ibytes;
|
|
183
|
+
return;
|
|
184
|
+
}
|
|
185
|
+
int sub2 = (ikey & 7) + 16;
|
|
186
|
+
if (key_[sub2] == ikey) {
|
|
187
|
+
value_[sub2] += ibytes;
|
|
188
|
+
score_[sub2] += score;
|
|
189
|
+
reliability_[sub2] += ireliability * ibytes;
|
|
190
|
+
return;
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
// Allocate new entry
|
|
194
|
+
int alloc = -1;
|
|
195
|
+
if (key_[sub0] == 0) {
|
|
196
|
+
alloc = sub0;
|
|
197
|
+
} else if (key_[sub1] == 0) {
|
|
198
|
+
alloc = sub1;
|
|
199
|
+
} else if (key_[sub2] == 0) {
|
|
200
|
+
alloc = sub2;
|
|
201
|
+
} else {
|
|
202
|
+
// All choices allocated, need to replace smallest one
|
|
203
|
+
alloc = sub0;
|
|
204
|
+
if (value_[sub1] < value_[alloc]) {alloc = sub1;}
|
|
205
|
+
if (value_[sub2] < value_[alloc]) {alloc = sub2;}
|
|
206
|
+
}
|
|
207
|
+
key_[alloc] = ikey;
|
|
208
|
+
value_[alloc] = ibytes;
|
|
209
|
+
score_[alloc] = score;
|
|
210
|
+
reliability_[alloc] = ireliability * ibytes;
|
|
211
|
+
return;
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
// Find subscript of a given packed language, or -1
|
|
215
|
+
int ToteWithReliability::Find(uint8 ikey) {
|
|
216
|
+
DCHECK(ikey != 0);
|
|
217
|
+
|
|
218
|
+
if (sorted_) {
|
|
219
|
+
// Linear search if sorted
|
|
220
|
+
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
|
221
|
+
if (key_[sub] == ikey) {return sub;}
|
|
222
|
+
}
|
|
223
|
+
return -1;
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
// Look for existing entry
|
|
227
|
+
int sub0 = ikey & 15;
|
|
228
|
+
if (key_[sub0] == ikey) {
|
|
229
|
+
return sub0;
|
|
230
|
+
}
|
|
231
|
+
int sub1 = sub0 ^ 8;
|
|
232
|
+
if (key_[sub1] == ikey) {
|
|
233
|
+
return sub1;
|
|
234
|
+
}
|
|
235
|
+
int sub2 = (ikey & 7) + 16;
|
|
236
|
+
if (key_[sub2] == ikey) {
|
|
237
|
+
return sub2;
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
return -1;
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
// Return current top key
|
|
244
|
+
int ToteWithReliability::CurrentTopKey() {
|
|
245
|
+
int top_key = 0;
|
|
246
|
+
int top_value = -1;
|
|
247
|
+
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
|
248
|
+
if (key_[sub] == 0) {continue;}
|
|
249
|
+
if (top_value < value_[sub]) {
|
|
250
|
+
top_value = value_[sub];
|
|
251
|
+
top_key = key_[sub];
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
return top_key;
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
// Sort first n entries by decreasing order of value
|
|
259
|
+
// If key==0 other fields are not valid, treat value as -1
|
|
260
|
+
void ToteWithReliability::Sort(int n) {
|
|
261
|
+
// This is n**2, but n is small
|
|
262
|
+
for (int sub = 0; sub < n; ++sub) {
|
|
263
|
+
if (key_[sub] == 0) {value_[sub] = -1;}
|
|
264
|
+
|
|
265
|
+
// Bubble sort key[sub] and entry[sub]
|
|
266
|
+
for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
|
|
267
|
+
if (key_[sub2] == 0) {value_[sub2] = -1;}
|
|
268
|
+
if (value_[sub] < value_[sub2]) {
|
|
269
|
+
// swap
|
|
270
|
+
uint8 tmpk = key_[sub];
|
|
271
|
+
key_[sub] = key_[sub2];
|
|
272
|
+
key_[sub2] = tmpk;
|
|
273
|
+
|
|
274
|
+
int tmpv = value_[sub];
|
|
275
|
+
value_[sub] = value_[sub2];
|
|
276
|
+
value_[sub2] = tmpv;
|
|
277
|
+
|
|
278
|
+
double tmps = score_[sub];
|
|
279
|
+
score_[sub] = score_[sub2];
|
|
280
|
+
score_[sub2] = tmps;
|
|
281
|
+
|
|
282
|
+
int tmpr = reliability_[sub];
|
|
283
|
+
reliability_[sub] = reliability_[sub2];
|
|
284
|
+
reliability_[sub2] = tmpr;
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
sorted_ = 1;
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
void ToteWithReliability::Dump(FILE* f) {
|
|
292
|
+
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
|
293
|
+
if (key_[sub] > 0) {
|
|
294
|
+
fprintf(f, "[%2d] %3d %6d %5d %4d\n",
|
|
295
|
+
sub, key_[sub], value_[sub], score_[sub], reliability_[sub]);
|
|
296
|
+
}
|
|
297
|
+
}
|
|
298
|
+
fprintf(f, " %d#\n", incr_count_);
|
|
299
|
+
}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_TOTE_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_TOTE_H_
|
|
7
|
+
|
|
8
|
+
#include <stdio.h>
|
|
9
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
|
10
|
+
|
|
11
|
+
// Take a set of <key, value> pairs and tote them up.
|
|
12
|
+
// After explicitly sorting, retrieve top key, value pairs
|
|
13
|
+
class Tote {
|
|
14
|
+
public:
|
|
15
|
+
Tote();
|
|
16
|
+
~Tote();
|
|
17
|
+
void Reinit();
|
|
18
|
+
void AddGram();
|
|
19
|
+
void Add(uint8 ikey, int idelta);
|
|
20
|
+
void AddBytes(int ibytes) {byte_count_ += ibytes;}
|
|
21
|
+
int CurrentTopKey();
|
|
22
|
+
void Sort(int n);
|
|
23
|
+
void Dump(FILE* f);
|
|
24
|
+
uint16 GetGramCount() const {return gram_count_;}
|
|
25
|
+
uint16 GetIncrCount() const {return incr_count_;}
|
|
26
|
+
int GetByteCount() const {return byte_count_;}
|
|
27
|
+
int MaxSize() const {return kMaxSize_;}
|
|
28
|
+
uint8 Key(int i) const {return key_[i];}
|
|
29
|
+
int Value(int i) const {return value_[i];}
|
|
30
|
+
void SetGramCount(uint16 v) {gram_count_ = v;}
|
|
31
|
+
void SetIncrCount(uint16 v) {incr_count_ = v;}
|
|
32
|
+
void SetKey(int i, int v) {key_[i] = v;}
|
|
33
|
+
void SetValue(int i, int v) {value_[i] = v;}
|
|
34
|
+
|
|
35
|
+
private:
|
|
36
|
+
static const int kMaxSize_ = 24;
|
|
37
|
+
uint16 gram_count_; // Number of quadgrams/etc. scored
|
|
38
|
+
uint16 incr_count_; // Number of Add calls (1-3 per gram)
|
|
39
|
+
int byte_count_; // Bytes of text scored
|
|
40
|
+
// Align at multiple of 8 bytes
|
|
41
|
+
uint8 key_[kMaxSize_]; // Lang unassigned = 0, valid = 1..255
|
|
42
|
+
int value_[kMaxSize_]; // Probability score sum
|
|
43
|
+
};
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
// Take a set of <key, value, reliability> triples and tote them up.
|
|
47
|
+
// After explicitly sorting, retrieve top key, value, reliability triples
|
|
48
|
+
class ToteWithReliability {
|
|
49
|
+
public:
|
|
50
|
+
ToteWithReliability();
|
|
51
|
+
~ToteWithReliability();
|
|
52
|
+
void Reinit();
|
|
53
|
+
void Add(uint8 ikey, int ibytes, int score, int ireliability);
|
|
54
|
+
int Find(uint8 ikey);
|
|
55
|
+
void AddClosePair(int subscr, int val) {closepair_[subscr] += val;}
|
|
56
|
+
int CurrentTopKey();
|
|
57
|
+
void Sort(int n);
|
|
58
|
+
void Dump(FILE* f);
|
|
59
|
+
|
|
60
|
+
////void AddSeq(uint8 ikey) {ss_.Add(ikey);}
|
|
61
|
+
////void ExtractSeq(int n, uint8* dst) {ss_.Extract(n, dst);}
|
|
62
|
+
|
|
63
|
+
int GetIncrCount() const {return incr_count_;}
|
|
64
|
+
int GetClosePair(int subscr) const {return closepair_[subscr];}
|
|
65
|
+
int MaxSize() const {return kMaxSize_;}
|
|
66
|
+
uint8 Key(int i) const {return key_[i];}
|
|
67
|
+
int Value(int i) const {return value_[i];}
|
|
68
|
+
int Score(int i) const {return score_[i];}
|
|
69
|
+
int Reliability(int i) const {return reliability_[i];}
|
|
70
|
+
void SetKey(int i, int v) {key_[i] = v;}
|
|
71
|
+
void SetValue(int i, int v) {value_[i] = v;}
|
|
72
|
+
void SetScore(int i, int v) {score_[i] = v;}
|
|
73
|
+
void SetReliability(int i, int v) {reliability_[i] = v;}
|
|
74
|
+
|
|
75
|
+
private:
|
|
76
|
+
static const int kMaxSize_ = 24;
|
|
77
|
+
static const int kMaxClosePairSize_ = 8;
|
|
78
|
+
int incr_count_; // Number of Add calls
|
|
79
|
+
int sorted_; // Contents have been sorted, cannot Add
|
|
80
|
+
// Align at multiple of 8 bytes
|
|
81
|
+
int closepair_[kMaxClosePairSize_];
|
|
82
|
+
uint8 key_[kMaxSize_]; // Lang unassigned = 0, valid = 1..255
|
|
83
|
+
int value_[kMaxSize_]; // Bytecount this lang
|
|
84
|
+
int score_[kMaxSize_]; // Probability score sum
|
|
85
|
+
int reliability_[kMaxSize_]; // Percentage 0..100
|
|
86
|
+
////SubsetSequence ss_;
|
|
87
|
+
};
|
|
88
|
+
|
|
89
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_TOTE_H_
|