cld-fixed 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/.rspec +2 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +27 -0
  6. data/README.md +34 -0
  7. data/Rakefile +5 -0
  8. data/cld.gemspec +22 -0
  9. data/ext/cld/Makefile.am +28 -0
  10. data/ext/cld/Makefile.in +790 -0
  11. data/ext/cld/aclocal.m4 +8895 -0
  12. data/ext/cld/base/basictypes.h +348 -0
  13. data/ext/cld/base/build_config.h +115 -0
  14. data/ext/cld/base/casts.h +156 -0
  15. data/ext/cld/base/commandlineflags.h +443 -0
  16. data/ext/cld/base/crash.h +41 -0
  17. data/ext/cld/base/dynamic_annotations.h +358 -0
  18. data/ext/cld/base/global_strip_options.h +59 -0
  19. data/ext/cld/base/log_severity.h +46 -0
  20. data/ext/cld/base/logging.h +1403 -0
  21. data/ext/cld/base/macros.h +243 -0
  22. data/ext/cld/base/port.h +54 -0
  23. data/ext/cld/base/scoped_ptr.h +428 -0
  24. data/ext/cld/base/stl_decl.h +0 -0
  25. data/ext/cld/base/stl_decl_msvc.h +107 -0
  26. data/ext/cld/base/string_util.h +29 -0
  27. data/ext/cld/base/strtoint.h +93 -0
  28. data/ext/cld/base/template_util.h +96 -0
  29. data/ext/cld/base/type_traits.h +198 -0
  30. data/ext/cld/base/vlog_is_on.h +143 -0
  31. data/ext/cld/build_aux/config.guess +1500 -0
  32. data/ext/cld/build_aux/config.sub +1616 -0
  33. data/ext/cld/build_aux/depcomp +584 -0
  34. data/ext/cld/build_aux/install-sh +507 -0
  35. data/ext/cld/build_aux/ltmain.sh +8745 -0
  36. data/ext/cld/build_aux/missing +367 -0
  37. data/ext/cld/cld_encodings.h +95 -0
  38. data/ext/cld/configure +17362 -0
  39. data/ext/cld/configure.ac +14 -0
  40. data/ext/cld/encodings/compact_lang_det/#cldutil.cc# +905 -0
  41. data/ext/cld/encodings/compact_lang_det/#cldutil.h# +1205 -0
  42. data/ext/cld/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
  43. data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
  44. data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
  45. data/ext/cld/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
  46. data/ext/cld/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
  47. data/ext/cld/encodings/compact_lang_det/#tote.cc# +299 -0
  48. data/ext/cld/encodings/compact_lang_det/#tote.h# +89 -0
  49. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  50. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  51. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  52. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  53. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  54. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  55. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  56. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  57. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  58. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  59. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  60. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  61. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  62. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  63. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  64. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  65. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  66. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  67. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  68. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  69. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  70. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  71. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  72. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  73. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  74. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  75. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  76. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  77. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  78. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  79. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  80. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  81. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  82. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  83. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  84. data/ext/cld/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
  85. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  86. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  87. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  88. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  89. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  90. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  91. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  92. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  93. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  94. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  95. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  96. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  97. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  98. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  99. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  100. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  101. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  102. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  103. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  104. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  105. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  106. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  107. data/ext/cld/encodings/internal/encodings.cc +12 -0
  108. data/ext/cld/encodings/lang_enc.h +254 -0
  109. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  110. data/ext/cld/encodings/public/encodings.h +301 -0
  111. data/ext/cld/extconf.rb +7 -0
  112. data/ext/cld/languages/internal/#languages.cc# +337 -0
  113. data/ext/cld/languages/internal/languages.cc +336 -0
  114. data/ext/cld/languages/proto/languages.pb.h +179 -0
  115. data/ext/cld/languages/public/languages.h +379 -0
  116. data/ext/cld/thunk.cc +55 -0
  117. data/lib/cld.rb +21 -0
  118. data/lib/cld/version.rb +3 -0
  119. data/spec/cld_spec.rb +67 -0
  120. data/spec/spec_helper.rb +6 -0
  121. metadata +193 -0
@@ -0,0 +1,44 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // Remember a subset of a sequence of values, using a modest amount of memory
6
+
7
+ #ifndef ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
8
+ #define ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
9
+
10
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
11
+ #include "encodings/compact_lang_det/win/cld_google.h"
12
+
13
+
14
+ class SubsetSequence {  // Declaration only: every non-inline member is defined outside this header
15
+ public:
16
+ void Init();  // Also invoked by the constructor below
17
+ void Add(uint8 e);  // Takes one 8-bit element; NOTE(review): presumably records it -- body not visible here
18
+ void Extract(int n, uint8* dst);  // NOTE(review): presumably fills dst[0..n-1]; confirm against the .cc file
19
+ SubsetSequence() {Init();}  // Construction delegates all setup to Init()
20
+ ~SubsetSequence() {};  // Empty destructor body
21
+
22
+ private:
23
+ uint8 Median3(int sub);  // Private helpers: declared here, defined elsewhere
24
+ void NewLevel();
25
+ void DoCarries();
26
+ void Flush();
27
+
28
+ static const int kMaxLevel_ = 16; // 3**16 ~= 43M (3**20 ~= 3.5B)
29
+ static const int kMaxSeq_ = 128;  // Capacity of seq_ below
30
+
31
+ int k_;
32
+ int next_e_;
33
+ int limit_e_;
34
+ int level_limit_e_;
35
+ uint8 seq_[kMaxSeq_];
36
+ uint8 count_[kMaxLevel_ + 1]; // One slot more than kMaxLevel_; +1 allows graceful overflow
37
+
38
+ DISALLOW_EVIL_CONSTRUCTORS(SubsetSequence);  // NOTE(review): macro defined elsewhere; presumably forbids copy/assign
39
+
40
+ // Require enough room to end up with 40 entries plus carrying space
41
+ COMPILE_ASSERT(kMaxSeq_ >= (kMaxLevel_ * 2 + 40), kMaxSeq__is_too_small);  // 128 >= 16 * 2 + 40 = 72
42
+ };
43
+
44
+ #endif // ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
@@ -0,0 +1,99 @@
1
+ // Copyright 2008 Google Inc. All Rights Reserved.
2
+ // Author: dsites@google.com (Dick Sites)
3
+ /*
4
+ #include "testing/base/public/gunit.h"
5
+ #include "testing/lib/strings/overrun_sensitive_memory_block.h"
6
+ #include "cld/encodings/compact_lang_det/subsetsequence.h"
7
+
8
+ // This always passes. It is just scaffolding to exercise the subsequence
9
+ // facility, which is likely to get abandoned soon. dsites 2008.11.17
10
+ //
11
+ TEST(SubsetSequence, foo) {
12
+ uint8 dst[120];
13
+
14
+ // Create 120-element vector
15
+ printf("Creating %d items:\n", 120);
16
+ SubsetSequence ss;
17
+ for (int i = 0; i < 120; ++i) {
18
+ ss.Add(i);
19
+ }
20
+
21
+ // Extract various lengths
22
+ for (int n = 120; n >= 0; --n) {
23
+ ss.Extract(n, dst);
24
+ printf("[%d] ", n);
25
+ for (int i = 0; i < n; ++i) {
26
+ printf("%d ", dst[i]);
27
+ }
28
+ printf("\n");
29
+ }
30
+
31
+ printf("\n");
32
+ printf("\n");
33
+
34
+ // Create 120-element vector of 7 items each
35
+ printf("Creating %d items:\n", 120);
36
+ ss.Init();
37
+ for (int i = 0; i < 120; ++i) {
38
+ ss.Add(i / 7);
39
+ }
40
+
41
+ // Extract various lengths
42
+ for (int n = 120; n >= 0; --n) {
43
+ ss.Extract(n, dst);
44
+ printf("[%d] ", n);
45
+ for (int i = 0; i < n; ++i) {
46
+ printf("%d ", dst[i]);
47
+ }
48
+ printf("\n");
49
+ }
50
+
51
+ printf("\n");
52
+ printf("\n");
53
+
54
+
55
+ // Create 400 element vector of patterns
56
+ int nn1 = 400;
57
+ int divisor = (nn1 + 239) / 240; // Max inserted value = 240
58
+ printf("Creating %d items:\n", nn1);
59
+ ss.Init();
60
+ for (int i = 0; i < nn1; ++i) {
61
+ ss.Add(i / divisor);
62
+ }
63
+
64
+ // Extract 12-item summary lengths
65
+ int n1 = 12;
66
+ ss.Extract(n1, dst);
67
+ printf("[%d] ", n1);
68
+ for (int i = 0; i < n1; ++i) {
69
+ printf("%d ", dst[i]);
70
+ }
71
+ printf("\n");
72
+
73
+ printf("\n");
74
+ printf("\n");
75
+
76
+ // Create 10**n element vector of patterns
77
+ int pow_10 = 1;
78
+ for (int nn = 0; nn < 9; ++nn) {
79
+ printf("Creating %d items:\n", pow_10);
80
+ int divisor = (pow_10 + 239) / 240; // Max inserted value = 240
81
+ ss.Init();
82
+ for (int i = 0; i < pow_10; ++i) {
83
+ ss.Add(i / divisor);
84
+ }
85
+
86
+ // Extract 12-item summary lengths
87
+ int n = 12;
88
+ ss.Extract(n, dst);
89
+ printf("[%d] ", n);
90
+ for (int i = 0; i < n; ++i) {
91
+ printf("%d ", dst[i]);
92
+ }
93
+ printf("\n");
94
+
95
+ pow_10 *= 10;
96
+ }
97
+
98
+ }
99
+ */
@@ -0,0 +1,299 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/tote.h"
6
+ #include <string.h> // memset
7
+
8
+ #include "encodings/compact_lang_det/win/cld_logging.h"
9
+
10
+
11
+ // Tote takes a set of <key, value> pairs and totes them up.
12
+ // After explicitly sorting, retrieve top key, value pairs.
13
+ Tote::Tote() {  // Starts with zeroed counters and every slot unassigned
14
+ gram_count_ = 0;
15
+ incr_count_ = 0;
16
+ byte_count_ = 0;
17
+ memset(key_, 0, sizeof(key_));  // Key 0 marks a slot as unassigned
18
+ // No need to initialize values: Add() writes a slot's value when it assigns the slot's key
19
+ }
20
+
21
+ Tote::~Tote() {  // Empty: the destructor performs no work
22
+ }
23
+
24
+ void Tote::Reinit() {  // Resets to the same state the constructor establishes
25
+ gram_count_ = 0;
26
+ incr_count_ = 0;
27
+ byte_count_ = 0;
28
+ memset(key_, 0, sizeof(key_));  // Marks every slot unassigned again
29
+ // No need to initialize values: stale value_ entries are overwritten when a key is assigned in Add()
30
+ }
31
+
32
+ // Increment count of quadgrams/trigrams/unigrams scored
33
+ void Tote::AddGram() {
34
+ ++gram_count_;  // gram_count_ is a uint16 (see tote.h), so it wraps past 65535
35
+ }
36
+
37
+ // Three-way associative, guaranteeing that the largest two counts are always
38
+ // in the data structure. kMaxSize must be a multiple of 3, and is tied to the
39
+ // subscript calculations here, which are for 8 sets of 3-way associative
40
+ // buckets. The subscripts for set N are [N], [N+8], and [N+16] used in a
41
+ // slightly-weird way: The initial probe point is [N] or [N+8], whichever
42
+ // is specified by key mod 16. In most cases (nearly *all* cases except Latin
43
+ // script), this entry matches and we update/return. The second probe is
44
+ // the other of [N] and [N+8]. The third probe is only used as a fallback to
45
+ // these two, and is there only for the rare case that there are three or more
46
+ // languages with Language enum values equal mod 8, contending within the same
47
+ // bucket. This can only happen in Latin and (rarely) Cyrillic scripts, because
48
+ // the other scripts have fewer than 17 languages total.
49
+ // If you change kMaxSize, change the constants 7/8/15/16 below
50
+ void Tote::Add(uint8 ikey, int idelta) {  // Adds idelta to ikey's running total; ikey must be nonzero
51
+ DCHECK(ikey != 0);  // 0 is reserved to mean "unassigned slot"
52
+ ++incr_count_;  // Counts every Add call, whether the key was already present or not
53
+
54
+ // Look for existing entry in each of the three probe slots, in order
55
+ int sub0 = ikey & 15;  // First probe: slot 0..15 chosen by the low four bits of the key
56
+ if (key_[sub0] == ikey) {
57
+ value_[sub0] += idelta;
58
+ return;
59
+ }
60
+ int sub1 = sub0 ^ 8;  // Second probe: same low three bits with bit 3 flipped
61
+ if (key_[sub1] == ikey) {
62
+ value_[sub1] += idelta;
63
+ return;
64
+ }
65
+ int sub2 = (ikey & 7) + 16;  // Third probe: fallback slot 16..23 for the same set
66
+ if (key_[sub2] == ikey) {
67
+ value_[sub2] += idelta;
68
+ return;
69
+ }
70
+
71
+ // Key not present: allocate the first unassigned probe slot, else evict
72
+ int alloc = -1;
73
+ if (key_[sub0] == 0) {
74
+ alloc = sub0;
75
+ } else if (key_[sub1] == 0) {
76
+ alloc = sub1;
77
+ } else if (key_[sub2] == 0) {
78
+ alloc = sub2;
79
+ } else {
80
+ // All choices allocated, need to replace smallest one (ties keep the earlier probe)
81
+ alloc = sub0;
82
+ if (value_[sub1] < value_[alloc]) {alloc = sub1;}
83
+ if (value_[sub2] < value_[alloc]) {alloc = sub2;}
84
+ }
85
+ key_[alloc] = ikey;
86
+ value_[alloc] = idelta;  // Overwrites: an evicted entry's accumulated total is discarded
87
+ return;
88
+ }
89
+
90
+ // Return the key with the largest value among assigned slots, or 0 if none qualifies
91
+ int Tote::CurrentTopKey() {
92
+ int top_key = 0;
93
+ int top_value = -1;  // Only values greater than -1 can become the top
94
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
95
+ if (key_[sub] == 0) {continue;}  // Unassigned slot: its value is not valid
96
+ if (top_value < value_[sub]) {  // Strict compare: the first slot wins a tie
97
+ top_value = value_[sub];
98
+ top_key = key_[sub];
99
+ }
100
+ }
101
+ return top_key;
102
+ }
103
+
104
+
105
+ // Sort so that slots 0..n-1 hold the n largest values in decreasing order
106
+ // If key==0 other fields are not valid, treat value as -1
107
+ void Tote::Sort(int n) {  // NOTE(review): n is not range-checked against kMaxSize_ here
108
+ // This is n**2, but n is small
109
+ for (int sub = 0; sub < n; ++sub) {
110
+ if (key_[sub] == 0) {value_[sub] = -1;}
111
+
112
+ // Compare slot sub against every later slot (through kMaxSize_, not just n), swapping key and value together
113
+ for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
114
+ if (key_[sub2] == 0) {value_[sub2] = -1;}
115
+ if (value_[sub] < value_[sub2]) {
116
+ // swap
117
+ uint8 tmpk = key_[sub];
118
+ key_[sub] = key_[sub2];
119
+ key_[sub2] = tmpk;
120
+ int tmpv = value_[sub];
121
+ value_[sub] = value_[sub2];
122
+ value_[sub2] = tmpv;
123
+ }
124
+ }
125
+ }
126
+ }
127
+
128
+ void Tote::Dump(FILE* f) {  // Writes every assigned slot, then the three counters, to f
129
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
130
+ if (key_[sub] > 0) {  // Skip unassigned slots (key 0)
131
+ fprintf(f, "[%2d] %3d %8d\n", sub, key_[sub], value_[sub]);  // Columns: slot, key, value
132
+ }
133
+ }
134
+ fprintf(f, "%d %d %d\n", gram_count_, incr_count_, byte_count_);
135
+ }
136
+
137
+
138
+
139
+
140
+ // Take a set of <key, value, reliability> triples and tote them up.
141
+ // After explicitly sorting, retrieve top key, value, reliability triples
142
+ ToteWithReliability::ToteWithReliability() {  // Starts unsorted, with no keys and zeroed close-pair counts
143
+ // No need to initialize score_ or value_ (reliability_ is likewise left unset): Add() writes them when it assigns a key
144
+ incr_count_ = 0;
145
+ sorted_ = 0;  // 0 = not yet sorted, so Add() is allowed
146
+ memset(closepair_, 0, sizeof(closepair_));
147
+ memset(key_, 0, sizeof(key_));  // Key 0 marks a slot as unassigned
148
+ }
149
+
150
+ ToteWithReliability::~ToteWithReliability() {  // Empty: the destructor performs no work
151
+ }
152
+
153
+ void ToteWithReliability::Reinit() {  // Resets to the same state the constructor establishes
154
+ // No need to initialize score_ or value_ (reliability_ is likewise left unset): Add() writes them when it assigns a key
155
+ incr_count_ = 0;
156
+ sorted_ = 0;  // Clears the sorted flag so Add() may be called again
157
+ memset(closepair_, 0, sizeof(closepair_));
158
+ memset(key_, 0, sizeof(key_));  // Marks every slot unassigned again
159
+ ////ss_.Init();
160
+ }
161
+
162
+ // Accumulate ibytes, score and byte-weighted reliability (ireliability * ibytes) for ikey
163
+ // Also see three-way associative comments above for Tote
164
+ void ToteWithReliability::Add(uint8 ikey, int ibytes,
165
+ int score, int ireliability) {
166
+ DCHECK(ikey != 0);  // 0 is reserved to mean "unassigned slot"
167
+ CHECK(sorted_ == 0);  // Sort() moves entries out of their probe slots, so Add must come first
168
+ ++incr_count_;
169
+
170
+ // Look for existing entry in each of the three probe slots, in order
171
+ int sub0 = ikey & 15;  // First probe: slot 0..15 chosen by the low four bits of the key
172
+ if (key_[sub0] == ikey) {
173
+ value_[sub0] += ibytes;
174
+ score_[sub0] += score;
175
+ reliability_[sub0] += ireliability * ibytes;
176
+ return;
177
+ }
178
+ int sub1 = sub0 ^ 8;  // Second probe: same low three bits with bit 3 flipped
179
+ if (key_[sub1] == ikey) {
180
+ value_[sub1] += ibytes;
181
+ score_[sub1] += score;
182
+ reliability_[sub1] += ireliability * ibytes;
183
+ return;
184
+ }
185
+ int sub2 = (ikey & 7) + 16;  // Third probe: fallback slot 16..23 for the same set
186
+ if (key_[sub2] == ikey) {
187
+ value_[sub2] += ibytes;
188
+ score_[sub2] += score;
189
+ reliability_[sub2] += ireliability * ibytes;
190
+ return;
191
+ }
192
+
193
+ // Key not present: allocate the first unassigned probe slot, else evict
194
+ int alloc = -1;
195
+ if (key_[sub0] == 0) {
196
+ alloc = sub0;
197
+ } else if (key_[sub1] == 0) {
198
+ alloc = sub1;
199
+ } else if (key_[sub2] == 0) {
200
+ alloc = sub2;
201
+ } else {
202
+ // All choices allocated, need to replace smallest one by value_ (ties keep the earlier probe)
203
+ alloc = sub0;
204
+ if (value_[sub1] < value_[alloc]) {alloc = sub1;}
205
+ if (value_[sub2] < value_[alloc]) {alloc = sub2;}
206
+ }
207
+ key_[alloc] = ikey;
208
+ value_[alloc] = ibytes;  // Overwrites: an evicted entry's accumulated totals are discarded
209
+ score_[alloc] = score;
210
+ reliability_[alloc] = ireliability * ibytes;
211
+ return;
212
+ }
213
+
214
+ // Find subscript of a given packed language, or -1
215
+ int ToteWithReliability::Find(uint8 ikey) {
216
+ DCHECK(ikey != 0);  // 0 means "unassigned slot", so it is never a valid lookup key
217
+
218
+ if (sorted_) {
219
+ // Linear search if sorted: Sort() has moved entries away from their probe slots
220
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
221
+ if (key_[sub] == ikey) {return sub;}
222
+ }
223
+ return -1;
224
+ }
225
+
226
+ // Not sorted: check the same three probe slots that Add() uses, in the same order
227
+ int sub0 = ikey & 15;
228
+ if (key_[sub0] == ikey) {
229
+ return sub0;
230
+ }
231
+ int sub1 = sub0 ^ 8;
232
+ if (key_[sub1] == ikey) {
233
+ return sub1;
234
+ }
235
+ int sub2 = (ikey & 7) + 16;
236
+ if (key_[sub2] == ikey) {
237
+ return sub2;
238
+ }
239
+
240
+ return -1;  // Key is not in any of its three probe slots
241
+ }
242
+
243
+ // Return the key with the largest value_ among assigned slots, or 0 if none qualifies
244
+ int ToteWithReliability::CurrentTopKey() {
245
+ int top_key = 0;
246
+ int top_value = -1;  // Only values greater than -1 can become the top
247
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
248
+ if (key_[sub] == 0) {continue;}  // Unassigned slot: its value is not valid
249
+ if (top_value < value_[sub]) {  // Strict compare: the first slot wins a tie
250
+ top_value = value_[sub];
251
+ top_key = key_[sub];
252
+ }
253
+ }
254
+ return top_key;
255
+ }
256
+
257
+
258
+ // Sort so that slots 0..n-1 hold the n largest values in decreasing order, then mark the tote sorted
259
+ // If key==0 other fields are not valid, treat value as -1
260
+ void ToteWithReliability::Sort(int n) {  // NOTE(review): n is not range-checked against kMaxSize_ here
261
+ // This is n**2, but n is small
262
+ for (int sub = 0; sub < n; ++sub) {
263
+ if (key_[sub] == 0) {value_[sub] = -1;}
264
+
265
+ // Compare slot sub against every later slot (through kMaxSize_, not just n), swapping all four fields together
266
+ for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
267
+ if (key_[sub2] == 0) {value_[sub2] = -1;}
268
+ if (value_[sub] < value_[sub2]) {
269
+ // swap
270
+ uint8 tmpk = key_[sub];
271
+ key_[sub] = key_[sub2];
272
+ key_[sub2] = tmpk;
273
+
274
+ int tmpv = value_[sub];
275
+ value_[sub] = value_[sub2];
276
+ value_[sub2] = tmpv;
277
+
278
+ int tmps = score_[sub];
279
+ score_[sub] = score_[sub2];
280
+ score_[sub2] = tmps;
281
+
282
+ int tmpr = reliability_[sub];
283
+ reliability_[sub] = reliability_[sub2];
284
+ reliability_[sub2] = tmpr;
285
+ }
286
+ }
287
+ }
288
+ sorted_ = 1;  // From here on Add() is rejected by its CHECK and Find() switches to linear search
289
+ }
290
+
291
+ void ToteWithReliability::Dump(FILE* f) {  // Writes every assigned slot, then the Add-call count, to f
292
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
293
+ if (key_[sub] > 0) {  // Skip unassigned slots (key 0)
294
+ fprintf(f, "[%2d] %3d %6d %5d %4d\n",
295
+ sub, key_[sub], value_[sub], score_[sub], reliability_[sub]);  // Columns: slot, key, value, score, reliability
296
+ }
297
+ }
298
+ fprintf(f, " %d#\n", incr_count_);
299
+ }
@@ -0,0 +1,89 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_TOTE_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_TOTE_H_
7
+
8
+ #include <stdio.h>
9
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
10
+
11
+ // Take a set of <key, value> pairs and tote them up.
12
+ // After explicitly sorting, retrieve top key, value pairs
13
+ class Tote {
14
+ public:
15
+ Tote();
16
+ ~Tote();
17
+ void Reinit();
18
+ void AddGram();
19
+ void Add(uint8 ikey, int idelta);
20
+ void AddBytes(int ibytes) {byte_count_ += ibytes;}  // Accumulates into the byte total
21
+ int CurrentTopKey();
22
+ void Sort(int n);
23
+ void Dump(FILE* f);
24
+ uint16 GetGramCount() const {return gram_count_;}
25
+ uint16 GetIncrCount() const {return incr_count_;}
26
+ int GetByteCount() const {return byte_count_;}
27
+ int MaxSize() const {return kMaxSize_;}  // Number of slots; valid indices for Key/Value are 0..MaxSize()-1
28
+ uint8 Key(int i) const {return key_[i];}  // No bounds check on i
29
+ int Value(int i) const {return value_[i];}  // No bounds check on i
30
+ void SetGramCount(uint16 v) {gram_count_ = v;}
31
+ void SetIncrCount(uint16 v) {incr_count_ = v;}
32
+ void SetKey(int i, int v) {key_[i] = v;}  // v is narrowed to uint8 on store; no bounds check on i
33
+ void SetValue(int i, int v) {value_[i] = v;}  // No bounds check on i
34
+
35
+ private:
36
+ static const int kMaxSize_ = 24;
37
+ uint16 gram_count_; // Number of quadgrams/etc. scored
38
+ uint16 incr_count_; // Number of Add calls (1-3 per gram)
39
+ int byte_count_; // Bytes of text scored
40
+ // Align at multiple of 8 bytes
41
+ uint8 key_[kMaxSize_]; // Lang unassigned = 0, valid = 1..255
42
+ int value_[kMaxSize_]; // Probability score sum
43
+ };
44
+
45
+
46
+ // Take a set of <key, value, reliability> triples and tote them up.
47
+ // After explicitly sorting, retrieve top key, value, reliability triples
48
+ class ToteWithReliability {
49
+ public:
50
+ ToteWithReliability();
51
+ ~ToteWithReliability();
52
+ void Reinit();
53
+ void Add(uint8 ikey, int ibytes, int score, int ireliability);
54
+ int Find(uint8 ikey);
55
+ void AddClosePair(int subscr, int val) {closepair_[subscr] += val;}  // No bounds check on subscr
56
+ int CurrentTopKey();
57
+ void Sort(int n);
58
+ void Dump(FILE* f);
59
+
60
+ ////void AddSeq(uint8 ikey) {ss_.Add(ikey);}
61
+ ////void ExtractSeq(int n, uint8* dst) {ss_.Extract(n, dst);}
62
+
63
+ int GetIncrCount() const {return incr_count_;}
64
+ int GetClosePair(int subscr) const {return closepair_[subscr];}  // No bounds check on subscr
65
+ int MaxSize() const {return kMaxSize_;}  // Number of slots; valid indices for the accessors below are 0..MaxSize()-1
66
+ uint8 Key(int i) const {return key_[i];}  // Accessors and setters below do not bounds-check i
67
+ int Value(int i) const {return value_[i];}
68
+ int Score(int i) const {return score_[i];}
69
+ int Reliability(int i) const {return reliability_[i];}
70
+ void SetKey(int i, int v) {key_[i] = v;}  // v is narrowed to uint8 on store
71
+ void SetValue(int i, int v) {value_[i] = v;}
72
+ void SetScore(int i, int v) {score_[i] = v;}
73
+ void SetReliability(int i, int v) {reliability_[i] = v;}
74
+
75
+ private:
76
+ static const int kMaxSize_ = 24;
77
+ static const int kMaxClosePairSize_ = 8;
78
+ int incr_count_; // Number of Add calls
79
+ int sorted_; // Contents have been sorted, cannot Add
80
+ // Align at multiple of 8 bytes
81
+ int closepair_[kMaxClosePairSize_];
82
+ uint8 key_[kMaxSize_]; // Lang unassigned = 0, valid = 1..255
83
+ int value_[kMaxSize_]; // Bytecount this lang
84
+ int score_[kMaxSize_]; // Probability score sum
85
+ int reliability_[kMaxSize_]; // Add() stores ireliability * ibytes; NOTE(review): percentage 0..100 presumably only after callers normalize
86
+ ////SubsetSequence ss_;
87
+ };
88
+
89
+ #endif // ENCODINGS_COMPACT_LANG_DET_TOTE_H_