cld-fixed 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/.rspec +2 -0
- data/Gemfile +6 -0
- data/LICENSE +27 -0
- data/README.md +34 -0
- data/Rakefile +5 -0
- data/cld.gemspec +22 -0
- data/ext/cld/Makefile.am +28 -0
- data/ext/cld/Makefile.in +790 -0
- data/ext/cld/aclocal.m4 +8895 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +115 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/build_aux/config.guess +1500 -0
- data/ext/cld/build_aux/config.sub +1616 -0
- data/ext/cld/build_aux/depcomp +584 -0
- data/ext/cld/build_aux/install-sh +507 -0
- data/ext/cld/build_aux/ltmain.sh +8745 -0
- data/ext/cld/build_aux/missing +367 -0
- data/ext/cld/cld_encodings.h +95 -0
- data/ext/cld/configure +17362 -0
- data/ext/cld/configure.ac +14 -0
- data/ext/cld/encodings/compact_lang_det/#cldutil.cc# +905 -0
- data/ext/cld/encodings/compact_lang_det/#cldutil.h# +1205 -0
- data/ext/cld/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
- data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
- data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
- data/ext/cld/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
- data/ext/cld/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
- data/ext/cld/encodings/compact_lang_det/#tote.cc# +299 -0
- data/ext/cld/encodings/compact_lang_det/#tote.h# +89 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +7 -0
- data/ext/cld/languages/internal/#languages.cc# +337 -0
- data/ext/cld/languages/internal/languages.cc +336 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/ext/cld/thunk.cc +55 -0
- data/lib/cld.rb +21 -0
- data/lib/cld/version.rb +3 -0
- data/spec/cld_spec.rb +67 -0
- data/spec/spec_helper.rb +6 -0
- metadata +193 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// Remember a subset of a sequence of values, using a modest amount of memory
|
|
6
|
+
|
|
7
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
|
|
8
|
+
#define ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
|
|
9
|
+
|
|
10
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
|
11
|
+
#include "encodings/compact_lang_det/win/cld_google.h"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SubsetSequence {
|
|
15
|
+
public:
|
|
16
|
+
void Init();
|
|
17
|
+
void Add(uint8 e);
|
|
18
|
+
void Extract(int n, uint8* dst);
|
|
19
|
+
SubsetSequence() {Init();}
|
|
20
|
+
~SubsetSequence() {};
|
|
21
|
+
|
|
22
|
+
private:
|
|
23
|
+
uint8 Median3(int sub);
|
|
24
|
+
void NewLevel();
|
|
25
|
+
void DoCarries();
|
|
26
|
+
void Flush();
|
|
27
|
+
|
|
28
|
+
static const int kMaxLevel_ = 16; // 3**16 ~= 43M (3**20 ~= 3.4B)
|
|
29
|
+
static const int kMaxSeq_ = 128;
|
|
30
|
+
|
|
31
|
+
int k_;
|
|
32
|
+
int next_e_;
|
|
33
|
+
int limit_e_;
|
|
34
|
+
int level_limit_e_;
|
|
35
|
+
uint8 seq_[kMaxSeq_];
|
|
36
|
+
uint8 count_[kMaxLevel_ + 1]; // +1 allows graceful overflow
|
|
37
|
+
|
|
38
|
+
DISALLOW_EVIL_CONSTRUCTORS(SubsetSequence);
|
|
39
|
+
|
|
40
|
+
// Require enough room to end up with 40 entries plus carrying space
|
|
41
|
+
COMPILE_ASSERT(kMaxSeq_ >= (kMaxLevel_ * 2 + 40), kMaxSeq__is_too_small);
|
|
42
|
+
};
|
|
43
|
+
|
|
44
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
// Copyright 2008 Google Inc. All Rights Reserved.
|
|
2
|
+
// Author: dsites@google.com (Dick Sites)
|
|
3
|
+
/*
|
|
4
|
+
#include "testing/base/public/gunit.h"
|
|
5
|
+
#include "testing/lib/strings/overrun_sensitive_memory_block.h"
|
|
6
|
+
#include "cld/encodings/compact_lang_det/subsetsequence.h"
|
|
7
|
+
|
|
8
|
+
// This always passes. It is just scaffolding to exercise the subsequence
|
|
9
|
+
// facility, which is likely to get abandoned soon. dsites 2008.11.17
|
|
10
|
+
//
|
|
11
|
+
TEST(SubsetSequence, foo) {
|
|
12
|
+
uint8 dst[120];
|
|
13
|
+
|
|
14
|
+
// Create 120-element vector
|
|
15
|
+
printf("Creating %d items:\n", 120);
|
|
16
|
+
SubsetSequence ss;
|
|
17
|
+
for (int i = 0; i < 120; ++i) {
|
|
18
|
+
ss.Add(i);
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
// Extract various lengths
|
|
22
|
+
for (int n = 120; n >= 0; --n) {
|
|
23
|
+
ss.Extract(n, dst);
|
|
24
|
+
printf("[%d] ", n);
|
|
25
|
+
for (int i = 0; i < n; ++i) {
|
|
26
|
+
printf("%d ", dst[i]);
|
|
27
|
+
}
|
|
28
|
+
printf("\n");
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
printf("\n");
|
|
32
|
+
printf("\n");
|
|
33
|
+
|
|
34
|
+
// Create 120-element vector of 7 items each
|
|
35
|
+
printf("Creating %d items:\n", 120);
|
|
36
|
+
ss.Init();
|
|
37
|
+
for (int i = 0; i < 120; ++i) {
|
|
38
|
+
ss.Add(i / 7);
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// Extract various lengths
|
|
42
|
+
for (int n = 120; n >= 0; --n) {
|
|
43
|
+
ss.Extract(n, dst);
|
|
44
|
+
printf("[%d] ", n);
|
|
45
|
+
for (int i = 0; i < n; ++i) {
|
|
46
|
+
printf("%d ", dst[i]);
|
|
47
|
+
}
|
|
48
|
+
printf("\n");
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
printf("\n");
|
|
52
|
+
printf("\n");
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
// Create 400 element vector of patterns
|
|
56
|
+
int nn1 = 400;
|
|
57
|
+
int divisor = (nn1 + 239) / 240; // Max inserted value = 240
|
|
58
|
+
printf("Creating %d items:\n", nn1);
|
|
59
|
+
ss.Init();
|
|
60
|
+
for (int i = 0; i < nn1; ++i) {
|
|
61
|
+
ss.Add(i / divisor);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
// Extract 12-item summary lengths
|
|
65
|
+
int n1 = 12;
|
|
66
|
+
ss.Extract(n1, dst);
|
|
67
|
+
printf("[%d] ", n1);
|
|
68
|
+
for (int i = 0; i < n1; ++i) {
|
|
69
|
+
printf("%d ", dst[i]);
|
|
70
|
+
}
|
|
71
|
+
printf("\n");
|
|
72
|
+
|
|
73
|
+
printf("\n");
|
|
74
|
+
printf("\n");
|
|
75
|
+
|
|
76
|
+
// Create 10**n element vector of patterns
|
|
77
|
+
int pow_10 = 1;
|
|
78
|
+
for (int nn = 0; nn < 9; ++nn) {
|
|
79
|
+
printf("Creating %d items:\n", pow_10);
|
|
80
|
+
int divisor = (pow_10 + 239) / 240; // Max inserted value = 240
|
|
81
|
+
ss.Init();
|
|
82
|
+
for (int i = 0; i < pow_10; ++i) {
|
|
83
|
+
ss.Add(i / divisor);
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
// Extract 12-item summary lengths
|
|
87
|
+
int n = 12;
|
|
88
|
+
ss.Extract(n, dst);
|
|
89
|
+
printf("[%d] ", n);
|
|
90
|
+
for (int i = 0; i < n; ++i) {
|
|
91
|
+
printf("%d ", dst[i]);
|
|
92
|
+
}
|
|
93
|
+
printf("\n");
|
|
94
|
+
|
|
95
|
+
pow_10 *= 10;
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
}
|
|
99
|
+
*/
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/tote.h"
|
|
6
|
+
#include <string.h> // memset
|
|
7
|
+
|
|
8
|
+
#include "encodings/compact_lang_det/win/cld_logging.h"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
// Take a set of <key, value> pairs and tote them up.
|
|
12
|
+
// After explicitly sorting, retrieve top key, value pairs
|
|
13
|
+
Tote::Tote()
    : gram_count_(0),
      incr_count_(0),
      byte_count_(0) {
  // A zero key marks a slot as unassigned, so clearing key_ empties the table.
  // value_ is deliberately left uninitialized: a slot's value is only
  // meaningful once its key is nonzero.
  memset(key_, 0, sizeof(key_));
}
|
|
20
|
+
|
|
21
|
+
// Nothing to release: all storage is held in fixed-size member arrays.
Tote::~Tote() {}
|
|
23
|
+
|
|
24
|
+
// Returns the tote to the same state the constructor establishes.
void Tote::Reinit() {
  // A zero key marks a slot as unassigned; value_ is left untouched because
  // it is only meaningful for slots whose key is nonzero.
  memset(key_, 0, sizeof(key_));

  byte_count_ = 0;
  incr_count_ = 0;
  gram_count_ = 0;
}
|
|
31
|
+
|
|
32
|
+
// Increment count of quadgrams/trigrams/unigrams scored
|
|
33
|
+
void Tote::AddGram() {
|
|
34
|
+
++gram_count_;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
// Three-way associative, guaranteeing that the largest two counts are always
|
|
38
|
+
// in the data structure. kMaxSize must be a multiple of 3, and is tied to the
|
|
39
|
+
// subscript calculations here, which are for 8 sets of 3-way associative
|
|
40
|
+
// buckets. The subscripts for set N are [N], [N+8], and [N+16] used in a
|
|
41
|
+
// slightly-weird way: The initial probe point is [N] or [N+8], whichever
|
|
42
|
+
// is specified by key mod 16. In most cases (nearly *all* cases except Latin
|
|
43
|
+
// script), this entry matches and we update/return. The second probe is
|
|
44
|
+
// the other of [N] and [N+8]. The third probe is only used as a fallback to
|
|
45
|
+
// these two, and is there only for the rare case that there are three or more
|
|
46
|
+
// languages with Language enum values equal mod 8, contending within the same
|
|
47
|
+
// bucket. This can only happen in Latin and (rarely) Cyrillic scripts, because
|
|
48
|
+
// the other scripts have fewer than 17 languages total.
|
|
49
|
+
// If you change kMaxSize, change the constants 7/8/15/16 below
|
|
50
|
+
void Tote::Add(uint8 ikey, int idelta) {
|
|
51
|
+
DCHECK(ikey != 0);
|
|
52
|
+
++incr_count_;
|
|
53
|
+
|
|
54
|
+
// Look for existing entry
|
|
55
|
+
int sub0 = ikey & 15;
|
|
56
|
+
if (key_[sub0] == ikey) {
|
|
57
|
+
value_[sub0] += idelta;
|
|
58
|
+
return;
|
|
59
|
+
}
|
|
60
|
+
int sub1 = sub0 ^ 8;
|
|
61
|
+
if (key_[sub1] == ikey) {
|
|
62
|
+
value_[sub1] += idelta;
|
|
63
|
+
return;
|
|
64
|
+
}
|
|
65
|
+
int sub2 = (ikey & 7) + 16;
|
|
66
|
+
if (key_[sub2] == ikey) {
|
|
67
|
+
value_[sub2] += idelta;
|
|
68
|
+
return;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
// Allocate new entry
|
|
72
|
+
int alloc = -1;
|
|
73
|
+
if (key_[sub0] == 0) {
|
|
74
|
+
alloc = sub0;
|
|
75
|
+
} else if (key_[sub1] == 0) {
|
|
76
|
+
alloc = sub1;
|
|
77
|
+
} else if (key_[sub2] == 0) {
|
|
78
|
+
alloc = sub2;
|
|
79
|
+
} else {
|
|
80
|
+
// All choices allocated, need to replace smallest one
|
|
81
|
+
alloc = sub0;
|
|
82
|
+
if (value_[sub1] < value_[alloc]) {alloc = sub1;}
|
|
83
|
+
if (value_[sub2] < value_[alloc]) {alloc = sub2;}
|
|
84
|
+
}
|
|
85
|
+
key_[alloc] = ikey;
|
|
86
|
+
value_[alloc] = idelta;
|
|
87
|
+
return;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
// Return current top key
|
|
91
|
+
int Tote::CurrentTopKey() {
|
|
92
|
+
int top_key = 0;
|
|
93
|
+
int top_value = -1;
|
|
94
|
+
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
|
95
|
+
if (key_[sub] == 0) {continue;}
|
|
96
|
+
if (top_value < value_[sub]) {
|
|
97
|
+
top_value = value_[sub];
|
|
98
|
+
top_key = key_[sub];
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
return top_key;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
// Sort first n entries by decreasing order of value
|
|
106
|
+
// If key==0 other fields are not valid, treat value as -1
|
|
107
|
+
void Tote::Sort(int n) {
|
|
108
|
+
// This is n**2, but n is small
|
|
109
|
+
for (int sub = 0; sub < n; ++sub) {
|
|
110
|
+
if (key_[sub] == 0) {value_[sub] = -1;}
|
|
111
|
+
|
|
112
|
+
// Bubble sort key[sub] and entry[sub]
|
|
113
|
+
for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
|
|
114
|
+
if (key_[sub2] == 0) {value_[sub2] = -1;}
|
|
115
|
+
if (value_[sub] < value_[sub2]) {
|
|
116
|
+
// swap
|
|
117
|
+
uint8 tmpk = key_[sub];
|
|
118
|
+
key_[sub] = key_[sub2];
|
|
119
|
+
key_[sub2] = tmpk;
|
|
120
|
+
int tmpv = value_[sub];
|
|
121
|
+
value_[sub] = value_[sub2];
|
|
122
|
+
value_[sub2] = tmpv;
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
void Tote::Dump(FILE* f) {
|
|
129
|
+
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
|
130
|
+
if (key_[sub] > 0) {
|
|
131
|
+
fprintf(f, "[%2d] %3d %8d\n", sub, key_[sub], value_[sub]);
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
fprintf(f, "%d %d %d\n", gram_count_, incr_count_, byte_count_);
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
// Take a set of <key, value> pairs and tote them up.
|
|
141
|
+
// After explicitly sorting, retrieve top key, value pairs
|
|
142
|
+
ToteWithReliability::ToteWithReliability()
    : incr_count_(0),
      sorted_(0) {
  // A zero key marks a slot as unassigned, so value_, score_ and
  // reliability_ need no initialization: they are only meaningful for slots
  // whose key is nonzero.
  memset(closepair_, 0, sizeof(closepair_));
  memset(key_, 0, sizeof(key_));
}
|
|
149
|
+
|
|
150
|
+
// Nothing to release: all storage is held in fixed-size member arrays.
ToteWithReliability::~ToteWithReliability() {}
|
|
152
|
+
|
|
153
|
+
// Returns the tote to the same state the constructor establishes, which also
// clears the sorted flag so that Add may be called again.
void ToteWithReliability::Reinit() {
  // A zero key marks a slot as unassigned; value_, score_ and reliability_
  // are left untouched because they are only meaningful for assigned slots.
  memset(key_, 0, sizeof(key_));
  memset(closepair_, 0, sizeof(closepair_));

  sorted_ = 0;
  incr_count_ = 0;

  // The SubsetSequence member is disabled, so there is no sequence to reset:
  ////ss_.Init();
}
|
|
161
|
+
|
|
162
|
+
// Weight reliability by ibytes
|
|
163
|
+
// Also see three-way associative comments above for Tote
|
|
164
|
+
void ToteWithReliability::Add(uint8 ikey, int ibytes,
|
|
165
|
+
int score, int ireliability) {
|
|
166
|
+
DCHECK(ikey != 0);
|
|
167
|
+
CHECK(sorted_ == 0);
|
|
168
|
+
++incr_count_;
|
|
169
|
+
|
|
170
|
+
// Look for existing entry
|
|
171
|
+
int sub0 = ikey & 15;
|
|
172
|
+
if (key_[sub0] == ikey) {
|
|
173
|
+
value_[sub0] += ibytes;
|
|
174
|
+
score_[sub0] += score;
|
|
175
|
+
reliability_[sub0] += ireliability * ibytes;
|
|
176
|
+
return;
|
|
177
|
+
}
|
|
178
|
+
int sub1 = sub0 ^ 8;
|
|
179
|
+
if (key_[sub1] == ikey) {
|
|
180
|
+
value_[sub1] += ibytes;
|
|
181
|
+
score_[sub1] += score;
|
|
182
|
+
reliability_[sub1] += ireliability * ibytes;
|
|
183
|
+
return;
|
|
184
|
+
}
|
|
185
|
+
int sub2 = (ikey & 7) + 16;
|
|
186
|
+
if (key_[sub2] == ikey) {
|
|
187
|
+
value_[sub2] += ibytes;
|
|
188
|
+
score_[sub2] += score;
|
|
189
|
+
reliability_[sub2] += ireliability * ibytes;
|
|
190
|
+
return;
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
// Allocate new entry
|
|
194
|
+
int alloc = -1;
|
|
195
|
+
if (key_[sub0] == 0) {
|
|
196
|
+
alloc = sub0;
|
|
197
|
+
} else if (key_[sub1] == 0) {
|
|
198
|
+
alloc = sub1;
|
|
199
|
+
} else if (key_[sub2] == 0) {
|
|
200
|
+
alloc = sub2;
|
|
201
|
+
} else {
|
|
202
|
+
// All choices allocated, need to replace smallest one
|
|
203
|
+
alloc = sub0;
|
|
204
|
+
if (value_[sub1] < value_[alloc]) {alloc = sub1;}
|
|
205
|
+
if (value_[sub2] < value_[alloc]) {alloc = sub2;}
|
|
206
|
+
}
|
|
207
|
+
key_[alloc] = ikey;
|
|
208
|
+
value_[alloc] = ibytes;
|
|
209
|
+
score_[alloc] = score;
|
|
210
|
+
reliability_[alloc] = ireliability * ibytes;
|
|
211
|
+
return;
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
// Find subscript of a given packed language, or -1
|
|
215
|
+
int ToteWithReliability::Find(uint8 ikey) {
|
|
216
|
+
DCHECK(ikey != 0);
|
|
217
|
+
|
|
218
|
+
if (sorted_) {
|
|
219
|
+
// Linear search if sorted
|
|
220
|
+
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
|
221
|
+
if (key_[sub] == ikey) {return sub;}
|
|
222
|
+
}
|
|
223
|
+
return -1;
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
// Look for existing entry
|
|
227
|
+
int sub0 = ikey & 15;
|
|
228
|
+
if (key_[sub0] == ikey) {
|
|
229
|
+
return sub0;
|
|
230
|
+
}
|
|
231
|
+
int sub1 = sub0 ^ 8;
|
|
232
|
+
if (key_[sub1] == ikey) {
|
|
233
|
+
return sub1;
|
|
234
|
+
}
|
|
235
|
+
int sub2 = (ikey & 7) + 16;
|
|
236
|
+
if (key_[sub2] == ikey) {
|
|
237
|
+
return sub2;
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
return -1;
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
// Return current top key
|
|
244
|
+
int ToteWithReliability::CurrentTopKey() {
|
|
245
|
+
int top_key = 0;
|
|
246
|
+
int top_value = -1;
|
|
247
|
+
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
|
248
|
+
if (key_[sub] == 0) {continue;}
|
|
249
|
+
if (top_value < value_[sub]) {
|
|
250
|
+
top_value = value_[sub];
|
|
251
|
+
top_key = key_[sub];
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
return top_key;
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
// Sort first n entries by decreasing order of value
|
|
259
|
+
// If key==0 other fields are not valid, treat value as -1
|
|
260
|
+
void ToteWithReliability::Sort(int n) {
|
|
261
|
+
// This is n**2, but n is small
|
|
262
|
+
for (int sub = 0; sub < n; ++sub) {
|
|
263
|
+
if (key_[sub] == 0) {value_[sub] = -1;}
|
|
264
|
+
|
|
265
|
+
// Bubble sort key[sub] and entry[sub]
|
|
266
|
+
for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
|
|
267
|
+
if (key_[sub2] == 0) {value_[sub2] = -1;}
|
|
268
|
+
if (value_[sub] < value_[sub2]) {
|
|
269
|
+
// swap
|
|
270
|
+
uint8 tmpk = key_[sub];
|
|
271
|
+
key_[sub] = key_[sub2];
|
|
272
|
+
key_[sub2] = tmpk;
|
|
273
|
+
|
|
274
|
+
int tmpv = value_[sub];
|
|
275
|
+
value_[sub] = value_[sub2];
|
|
276
|
+
value_[sub2] = tmpv;
|
|
277
|
+
|
|
278
|
+
double tmps = score_[sub];
|
|
279
|
+
score_[sub] = score_[sub2];
|
|
280
|
+
score_[sub2] = tmps;
|
|
281
|
+
|
|
282
|
+
int tmpr = reliability_[sub];
|
|
283
|
+
reliability_[sub] = reliability_[sub2];
|
|
284
|
+
reliability_[sub2] = tmpr;
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
sorted_ = 1;
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
void ToteWithReliability::Dump(FILE* f) {
|
|
292
|
+
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
|
293
|
+
if (key_[sub] > 0) {
|
|
294
|
+
fprintf(f, "[%2d] %3d %6d %5d %4d\n",
|
|
295
|
+
sub, key_[sub], value_[sub], score_[sub], reliability_[sub]);
|
|
296
|
+
}
|
|
297
|
+
}
|
|
298
|
+
fprintf(f, " %d#\n", incr_count_);
|
|
299
|
+
}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_TOTE_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_TOTE_H_
|
|
7
|
+
|
|
8
|
+
#include <stdio.h>
|
|
9
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
|
10
|
+
|
|
11
|
+
// Take a set of <key, value> pairs and tote them up.
|
|
12
|
+
// After explicitly sorting, retrieve top key, value pairs
|
|
13
|
+
class Tote {
|
|
14
|
+
public:
|
|
15
|
+
Tote();
|
|
16
|
+
~Tote();
|
|
17
|
+
void Reinit();
|
|
18
|
+
void AddGram();
|
|
19
|
+
void Add(uint8 ikey, int idelta);
|
|
20
|
+
void AddBytes(int ibytes) {byte_count_ += ibytes;}
|
|
21
|
+
int CurrentTopKey();
|
|
22
|
+
void Sort(int n);
|
|
23
|
+
void Dump(FILE* f);
|
|
24
|
+
uint16 GetGramCount() const {return gram_count_;}
|
|
25
|
+
uint16 GetIncrCount() const {return incr_count_;}
|
|
26
|
+
int GetByteCount() const {return byte_count_;}
|
|
27
|
+
int MaxSize() const {return kMaxSize_;}
|
|
28
|
+
uint8 Key(int i) const {return key_[i];}
|
|
29
|
+
int Value(int i) const {return value_[i];}
|
|
30
|
+
void SetGramCount(uint16 v) {gram_count_ = v;}
|
|
31
|
+
void SetIncrCount(uint16 v) {incr_count_ = v;}
|
|
32
|
+
void SetKey(int i, int v) {key_[i] = v;}
|
|
33
|
+
void SetValue(int i, int v) {value_[i] = v;}
|
|
34
|
+
|
|
35
|
+
private:
|
|
36
|
+
static const int kMaxSize_ = 24;
|
|
37
|
+
uint16 gram_count_; // Number of quadgrams/etc. scored
|
|
38
|
+
uint16 incr_count_; // Number of Add calls (1-3 per gram)
|
|
39
|
+
int byte_count_; // Bytes of text scored
|
|
40
|
+
// Align at multiple of 8 bytes
|
|
41
|
+
uint8 key_[kMaxSize_]; // Lang unassigned = 0, valid = 1..255
|
|
42
|
+
int value_[kMaxSize_]; // Probability score sum
|
|
43
|
+
};
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
// Take a set of <key, value, reliability> triples and tote them up.
|
|
47
|
+
// After explicitly sorting, retrieve top key, value, reliability triples
|
|
48
|
+
class ToteWithReliability {
|
|
49
|
+
public:
|
|
50
|
+
ToteWithReliability();
|
|
51
|
+
~ToteWithReliability();
|
|
52
|
+
void Reinit();
|
|
53
|
+
void Add(uint8 ikey, int ibytes, int score, int ireliability);
|
|
54
|
+
int Find(uint8 ikey);
|
|
55
|
+
void AddClosePair(int subscr, int val) {closepair_[subscr] += val;}
|
|
56
|
+
int CurrentTopKey();
|
|
57
|
+
void Sort(int n);
|
|
58
|
+
void Dump(FILE* f);
|
|
59
|
+
|
|
60
|
+
////void AddSeq(uint8 ikey) {ss_.Add(ikey);}
|
|
61
|
+
////void ExtractSeq(int n, uint8* dst) {ss_.Extract(n, dst);}
|
|
62
|
+
|
|
63
|
+
int GetIncrCount() const {return incr_count_;}
|
|
64
|
+
int GetClosePair(int subscr) const {return closepair_[subscr];}
|
|
65
|
+
int MaxSize() const {return kMaxSize_;}
|
|
66
|
+
uint8 Key(int i) const {return key_[i];}
|
|
67
|
+
int Value(int i) const {return value_[i];}
|
|
68
|
+
int Score(int i) const {return score_[i];}
|
|
69
|
+
int Reliability(int i) const {return reliability_[i];}
|
|
70
|
+
void SetKey(int i, int v) {key_[i] = v;}
|
|
71
|
+
void SetValue(int i, int v) {value_[i] = v;}
|
|
72
|
+
void SetScore(int i, int v) {score_[i] = v;}
|
|
73
|
+
void SetReliability(int i, int v) {reliability_[i] = v;}
|
|
74
|
+
|
|
75
|
+
private:
|
|
76
|
+
static const int kMaxSize_ = 24;
|
|
77
|
+
static const int kMaxClosePairSize_ = 8;
|
|
78
|
+
int incr_count_; // Number of Add calls
|
|
79
|
+
int sorted_; // Contents have been sorted, cannot Add
|
|
80
|
+
// Align at multiple of 8 bytes
|
|
81
|
+
int closepair_[kMaxClosePairSize_];
|
|
82
|
+
uint8 key_[kMaxSize_]; // Lang unassigned = 0, valid = 1..255
|
|
83
|
+
int value_[kMaxSize_]; // Bytecount this lang
|
|
84
|
+
int score_[kMaxSize_]; // Probability score sum
|
|
85
|
+
int reliability_[kMaxSize_]; // Percentage 0..100
|
|
86
|
+
////SubsetSequence ss_;
|
|
87
|
+
};
|
|
88
|
+
|
|
89
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_TOTE_H_
|