cld-fixed 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/.rspec +2 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +27 -0
  6. data/README.md +34 -0
  7. data/Rakefile +5 -0
  8. data/cld.gemspec +22 -0
  9. data/ext/cld/Makefile.am +28 -0
  10. data/ext/cld/Makefile.in +790 -0
  11. data/ext/cld/aclocal.m4 +8895 -0
  12. data/ext/cld/base/basictypes.h +348 -0
  13. data/ext/cld/base/build_config.h +115 -0
  14. data/ext/cld/base/casts.h +156 -0
  15. data/ext/cld/base/commandlineflags.h +443 -0
  16. data/ext/cld/base/crash.h +41 -0
  17. data/ext/cld/base/dynamic_annotations.h +358 -0
  18. data/ext/cld/base/global_strip_options.h +59 -0
  19. data/ext/cld/base/log_severity.h +46 -0
  20. data/ext/cld/base/logging.h +1403 -0
  21. data/ext/cld/base/macros.h +243 -0
  22. data/ext/cld/base/port.h +54 -0
  23. data/ext/cld/base/scoped_ptr.h +428 -0
  24. data/ext/cld/base/stl_decl.h +0 -0
  25. data/ext/cld/base/stl_decl_msvc.h +107 -0
  26. data/ext/cld/base/string_util.h +29 -0
  27. data/ext/cld/base/strtoint.h +93 -0
  28. data/ext/cld/base/template_util.h +96 -0
  29. data/ext/cld/base/type_traits.h +198 -0
  30. data/ext/cld/base/vlog_is_on.h +143 -0
  31. data/ext/cld/build_aux/config.guess +1500 -0
  32. data/ext/cld/build_aux/config.sub +1616 -0
  33. data/ext/cld/build_aux/depcomp +584 -0
  34. data/ext/cld/build_aux/install-sh +507 -0
  35. data/ext/cld/build_aux/ltmain.sh +8745 -0
  36. data/ext/cld/build_aux/missing +367 -0
  37. data/ext/cld/cld_encodings.h +95 -0
  38. data/ext/cld/configure +17362 -0
  39. data/ext/cld/configure.ac +14 -0
  40. data/ext/cld/encodings/compact_lang_det/#cldutil.cc# +905 -0
  41. data/ext/cld/encodings/compact_lang_det/#cldutil.h# +1205 -0
  42. data/ext/cld/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
  43. data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
  44. data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
  45. data/ext/cld/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
  46. data/ext/cld/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
  47. data/ext/cld/encodings/compact_lang_det/#tote.cc# +299 -0
  48. data/ext/cld/encodings/compact_lang_det/#tote.h# +89 -0
  49. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  50. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  51. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  52. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  53. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  54. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  55. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  56. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  57. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  58. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  59. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  60. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  61. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  62. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  63. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  64. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  65. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  66. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  67. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  68. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  69. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  70. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  71. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  72. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  73. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  74. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  75. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  76. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  77. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  78. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  79. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  80. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  81. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  82. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  83. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  84. data/ext/cld/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
  85. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  86. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  87. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  88. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  89. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  90. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  91. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  92. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  93. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  94. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  95. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  96. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  97. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  98. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  99. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  100. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  101. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  102. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  103. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  104. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  105. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  106. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  107. data/ext/cld/encodings/internal/encodings.cc +12 -0
  108. data/ext/cld/encodings/lang_enc.h +254 -0
  109. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  110. data/ext/cld/encodings/public/encodings.h +301 -0
  111. data/ext/cld/extconf.rb +7 -0
  112. data/ext/cld/languages/internal/#languages.cc# +337 -0
  113. data/ext/cld/languages/internal/languages.cc +336 -0
  114. data/ext/cld/languages/proto/languages.pb.h +179 -0
  115. data/ext/cld/languages/public/languages.h +379 -0
  116. data/ext/cld/thunk.cc +55 -0
  117. data/lib/cld.rb +21 -0
  118. data/lib/cld/version.rb +3 -0
  119. data/spec/cld_spec.rb +67 -0
  120. data/spec/spec_helper.rb +6 -0
  121. metadata +193 -0
@@ -0,0 +1,131 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
7
+
8
+ #include "encodings/compact_lang_det/letterscript_enum.h"
9
+ #include "encodings/compact_lang_det/compact_lang_det_impl.h"
10
+
11
+ namespace getone {
12
// Buffer sizing constants, all expressed in bytes.
// NOTE(review): presumably these size ScriptScanner's script_buffer_ and
// script_buffer_lower_ and LangScanner's answer buffers -- confirm in the .cc.

// Capacity of the script buffer.
static const int kMaxScriptBuffer = 4096;

// Capacity of the lowercased buffer: one and a half times the script buffer.
static const int kMaxScriptLowerBuffer =
    kMaxScriptBuffer + (kMaxScriptBuffer / 2);

// Usable payload of the script buffer; 8 bytes are held back as slack.
static const int kMaxScriptBytes = kMaxScriptBuffer - 8;

// Capacity of each debugging answer buffer.
static const int kMaxAnswerBuffer = 256;
16
+
17
+ typedef enum UnicodeLScript ULScript;
18
+
19
+ typedef struct {
20
+ char* text; // Pointer to the span, somewhere
21
+ int text_bytes; // Number of bytes of text in the span
22
+ int offset; // Offset of start of span in original input buffer
23
+ ULScript script; // Script of all the letters in this span
24
+ Language lang; // Language identified for this span
25
+ bool truncated; // true if buffer filled up before a
26
+ // different script or EOF was found
27
+ } LangSpan;
28
+
29
+
30
// Returns true when c has the bit pattern 10xxxxxx (0x80..0xBF), i.e. the
// form of a UTF-8 continuation byte. Works on the unsigned value so the
// result does not depend on whether plain char is signed.
static inline bool IsContinuationByte(char c) {
  const unsigned char byte = static_cast<unsigned char>(c);
  return (byte & 0xC0) == 0x80;
}
33
+
34
+ // Gets lscript number for letters; always returns
35
+ // 0 (common script) for non-letters
36
+ int GetUTF8LetterScriptNum(const char* src);
37
+
38
+
39
+ // Update src pointer to point to next quadgram, +2..+5
40
+ // Looks at src[0..4]
41
+ const char* AdvanceQuad(const char* src);
42
+ } // end namespace getone
43
+
44
+
45
+
46
+
47
+
48
+
49
+ class ScriptScanner {
50
+ public:
51
+ ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
52
+ ~ScriptScanner();
53
+
54
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
55
+ bool GetOneScriptSpan(getone::LangSpan* span);
56
+
57
+ // Force Latin and Cyrillic scripts to be lowercase
58
+ void LowerScriptSpan(getone::LangSpan* span);
59
+
60
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
61
+ // Force Latin and Cyrillic scripts to be lowercase
62
+ bool GetOneScriptSpanLower(getone::LangSpan* span);
63
+
64
+ private:
65
+ int SkipToFrontOfSpan(const char* src, int len, int* script);
66
+
67
+ const char* start_byte_;
68
+ const char* next_byte_;
69
+ const char* next_byte_limit_;
70
+ int byte_length_;
71
+ bool is_plain_text_;
72
+ char* script_buffer_; // Holds text with expanded entities
73
+ char* script_buffer_lower_; // Holds lowercased text
74
+ };
75
+
76
+
77
+ class LangScanner {
78
+ public:
79
+ LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
80
+ getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
81
+ int maxlangs, int minlangspan);
82
+ ~LangScanner();
83
+
84
+
85
+ int script() {return script_;}
86
+
87
+ // Use new text
88
+ // Keep smoothing state if same script, otherwise reinit smoothing
89
+ void NewText(getone::LangSpan* spn);
90
+
91
+ bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
92
+ bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
93
+
94
+ // The real ones
95
+ bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
96
+ getone::LangSpan* span);
97
+ bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
98
+ getone::LangSpan* span);
99
+
100
+ // Increases language bias by delta
101
+ void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
102
+ Language key, int delta);
103
+
104
+ // For debugging output
105
+ int next_answer_;
106
+ char answer_buffer_[getone::kMaxAnswerBuffer];
107
+ char answer_buffer2_[getone::kMaxAnswerBuffer];
108
+ char answer_buffer3_[getone::kMaxAnswerBuffer];
109
+ char answer_buffer4_[getone::kMaxAnswerBuffer];
110
+
111
+ private:
112
+ const char* start_byte_;
113
+ const char* next_byte_limit_;
114
+ const char* next_byte_;
115
+ const char* onelangspan_begin_;
116
+ int byte_length_;
117
+ int script_;
118
+ Language spanlang_;
119
+ int smoothwidth_;
120
+ int smoothwidth_2_;
121
+ int smoothcandidates_;
122
+ int maxlangs_;
123
+ int minlangspan_;
124
+ int rb_size_;
125
+ int next_rb_;
126
+ int rb_mask_;
127
+ uint32* rb_;
128
+ int* offset_rb_;
129
+ };
130
+
131
+ #endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
@@ -0,0 +1,299 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/tote.h"
6
+ #include <string.h> // memset
7
+
8
+ #include "encodings/compact_lang_det/win/cld_logging.h"
9
+
10
+
11
+ // Take a set of <key, value> pairs and tote them up.
12
+ // After explicitly sorting, retrieve top key, value pairs
13
+ Tote::Tote() {
14
+ gram_count_ = 0;
15
+ incr_count_ = 0;
16
+ byte_count_ = 0;
17
+ memset(key_, 0, sizeof(key_));
18
+ // No need to initialize values
19
+ }
20
+
21
+ Tote::~Tote() {
22
+ }
23
+
24
+ void Tote::Reinit() {
25
+ gram_count_ = 0;
26
+ incr_count_ = 0;
27
+ byte_count_ = 0;
28
+ memset(key_, 0, sizeof(key_));
29
+ // No need to initialize values
30
+ }
31
+
32
+ // Increment count of quadgrams/trigrams/unigrams scored
33
+ void Tote::AddGram() {
34
+ ++gram_count_;
35
+ }
36
+
37
+ // Three-way associative, guaranteeing that the largest two counts are always
38
+ // in the data structure. kMaxSize must be a multiple of 3, and is tied to the
39
+ // subscript calculations here, which are for 8 sets of 3-way associative
40
+ // buckets. The subscripts for set N are [N], [N+8], and [N+16] used in a
41
+ // slightly-weird way: The initial probe point is [N] or [N+8], whichever
42
+ // is specified by key mod 16. In most cases (nearly *all* cases except Latin
43
+ // script), this entry matches and we update/return. The second probe is
44
+ // the other of [N] and [N+8]. The third probe is only used as a fallback to
45
+ // these two, and is there only for the rare case that there are three or more
46
+ // languages with Language enum values equal mod 8, contending within the same
47
+ // bucket. This can only happen in Latin and (rarely) Cyrillic scripts, because
48
+ // the other scripts have fewer than 17 languages total.
49
+ // If you change kMaxSize, change the constants 7/8/15/16 below
50
+ void Tote::Add(uint8 ikey, int idelta) {
51
+ DCHECK(ikey != 0);
52
+ ++incr_count_;
53
+
54
+ // Look for existing entry
55
+ int sub0 = ikey & 15;
56
+ if (key_[sub0] == ikey) {
57
+ value_[sub0] += idelta;
58
+ return;
59
+ }
60
+ int sub1 = sub0 ^ 8;
61
+ if (key_[sub1] == ikey) {
62
+ value_[sub1] += idelta;
63
+ return;
64
+ }
65
+ int sub2 = (ikey & 7) + 16;
66
+ if (key_[sub2] == ikey) {
67
+ value_[sub2] += idelta;
68
+ return;
69
+ }
70
+
71
+ // Allocate new entry
72
+ int alloc = -1;
73
+ if (key_[sub0] == 0) {
74
+ alloc = sub0;
75
+ } else if (key_[sub1] == 0) {
76
+ alloc = sub1;
77
+ } else if (key_[sub2] == 0) {
78
+ alloc = sub2;
79
+ } else {
80
+ // All choices allocated, need to replace smallest one
81
+ alloc = sub0;
82
+ if (value_[sub1] < value_[alloc]) {alloc = sub1;}
83
+ if (value_[sub2] < value_[alloc]) {alloc = sub2;}
84
+ }
85
+ key_[alloc] = ikey;
86
+ value_[alloc] = idelta;
87
+ return;
88
+ }
89
+
90
+ // Return current top key
91
+ int Tote::CurrentTopKey() {
92
+ int top_key = 0;
93
+ int top_value = -1;
94
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
95
+ if (key_[sub] == 0) {continue;}
96
+ if (top_value < value_[sub]) {
97
+ top_value = value_[sub];
98
+ top_key = key_[sub];
99
+ }
100
+ }
101
+ return top_key;
102
+ }
103
+
104
+
105
+ // Sort first n entries by decreasing order of value
106
+ // If key==0 other fields are not valid, treat value as -1
107
+ void Tote::Sort(int n) {
108
+ // This is n**2, but n is small
109
+ for (int sub = 0; sub < n; ++sub) {
110
+ if (key_[sub] == 0) {value_[sub] = -1;}
111
+
112
+ // Bubble sort key[sub] and entry[sub]
113
+ for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
114
+ if (key_[sub2] == 0) {value_[sub2] = -1;}
115
+ if (value_[sub] < value_[sub2]) {
116
+ // swap
117
+ uint8 tmpk = key_[sub];
118
+ key_[sub] = key_[sub2];
119
+ key_[sub2] = tmpk;
120
+ int tmpv = value_[sub];
121
+ value_[sub] = value_[sub2];
122
+ value_[sub2] = tmpv;
123
+ }
124
+ }
125
+ }
126
+ }
127
+
128
+ void Tote::Dump(FILE* f) {
129
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
130
+ if (key_[sub] > 0) {
131
+ fprintf(f, "[%2d] %3d %8d\n", sub, key_[sub], value_[sub]);
132
+ }
133
+ }
134
+ fprintf(f, "%d %d %d\n", gram_count_, incr_count_, byte_count_);
135
+ }
136
+
137
+
138
+
139
+
140
+ // Take a set of <key, value> pairs and tote them up.
141
+ // After explicitly sorting, retrieve top key, value pairs
142
+ ToteWithReliability::ToteWithReliability() {
143
+ // No need to initialize score_ or value_
144
+ incr_count_ = 0;
145
+ sorted_ = 0;
146
+ memset(closepair_, 0, sizeof(closepair_));
147
+ memset(key_, 0, sizeof(key_));
148
+ }
149
+
150
+ ToteWithReliability::~ToteWithReliability() {
151
+ }
152
+
153
+ void ToteWithReliability::Reinit() {
154
+ // No need to initialize score_ or value_
155
+ incr_count_ = 0;
156
+ sorted_ = 0;
157
+ memset(closepair_, 0, sizeof(closepair_));
158
+ memset(key_, 0, sizeof(key_));
159
+ ////ss_.Init();
160
+ }
161
+
162
+ // Weight reliability by ibytes
163
+ // Also see three-way associative comments above for Tote
164
+ void ToteWithReliability::Add(uint8 ikey, int ibytes,
165
+ int score, int ireliability) {
166
+ DCHECK(ikey != 0);
167
+ CHECK(sorted_ == 0);
168
+ ++incr_count_;
169
+
170
+ // Look for existing entry
171
+ int sub0 = ikey & 15;
172
+ if (key_[sub0] == ikey) {
173
+ value_[sub0] += ibytes;
174
+ score_[sub0] += score;
175
+ reliability_[sub0] += ireliability * ibytes;
176
+ return;
177
+ }
178
+ int sub1 = sub0 ^ 8;
179
+ if (key_[sub1] == ikey) {
180
+ value_[sub1] += ibytes;
181
+ score_[sub1] += score;
182
+ reliability_[sub1] += ireliability * ibytes;
183
+ return;
184
+ }
185
+ int sub2 = (ikey & 7) + 16;
186
+ if (key_[sub2] == ikey) {
187
+ value_[sub2] += ibytes;
188
+ score_[sub2] += score;
189
+ reliability_[sub2] += ireliability * ibytes;
190
+ return;
191
+ }
192
+
193
+ // Allocate new entry
194
+ int alloc = -1;
195
+ if (key_[sub0] == 0) {
196
+ alloc = sub0;
197
+ } else if (key_[sub1] == 0) {
198
+ alloc = sub1;
199
+ } else if (key_[sub2] == 0) {
200
+ alloc = sub2;
201
+ } else {
202
+ // All choices allocated, need to replace smallest one
203
+ alloc = sub0;
204
+ if (value_[sub1] < value_[alloc]) {alloc = sub1;}
205
+ if (value_[sub2] < value_[alloc]) {alloc = sub2;}
206
+ }
207
+ key_[alloc] = ikey;
208
+ value_[alloc] = ibytes;
209
+ score_[alloc] = score;
210
+ reliability_[alloc] = ireliability * ibytes;
211
+ return;
212
+ }
213
+
214
+ // Find subscript of a given packed language, or -1
215
+ int ToteWithReliability::Find(uint8 ikey) {
216
+ DCHECK(ikey != 0);
217
+
218
+ if (sorted_) {
219
+ // Linear search if sorted
220
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
221
+ if (key_[sub] == ikey) {return sub;}
222
+ }
223
+ return -1;
224
+ }
225
+
226
+ // Look for existing entry
227
+ int sub0 = ikey & 15;
228
+ if (key_[sub0] == ikey) {
229
+ return sub0;
230
+ }
231
+ int sub1 = sub0 ^ 8;
232
+ if (key_[sub1] == ikey) {
233
+ return sub1;
234
+ }
235
+ int sub2 = (ikey & 7) + 16;
236
+ if (key_[sub2] == ikey) {
237
+ return sub2;
238
+ }
239
+
240
+ return -1;
241
+ }
242
+
243
+ // Return current top key
244
+ int ToteWithReliability::CurrentTopKey() {
245
+ int top_key = 0;
246
+ int top_value = -1;
247
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
248
+ if (key_[sub] == 0) {continue;}
249
+ if (top_value < value_[sub]) {
250
+ top_value = value_[sub];
251
+ top_key = key_[sub];
252
+ }
253
+ }
254
+ return top_key;
255
+ }
256
+
257
+
258
+ // Sort first n entries by decreasing order of value
259
+ // If key==0 other fields are not valid, treat value as -1
260
+ void ToteWithReliability::Sort(int n) {
261
+ // This is n**2, but n is small
262
+ for (int sub = 0; sub < n; ++sub) {
263
+ if (key_[sub] == 0) {value_[sub] = -1;}
264
+
265
+ // Bubble sort key[sub] and entry[sub]
266
+ for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
267
+ if (key_[sub2] == 0) {value_[sub2] = -1;}
268
+ if (value_[sub] < value_[sub2]) {
269
+ // swap
270
+ uint8 tmpk = key_[sub];
271
+ key_[sub] = key_[sub2];
272
+ key_[sub2] = tmpk;
273
+
274
+ int tmpv = value_[sub];
275
+ value_[sub] = value_[sub2];
276
+ value_[sub2] = tmpv;
277
+
278
+ double tmps = score_[sub];
279
+ score_[sub] = score_[sub2];
280
+ score_[sub2] = tmps;
281
+
282
+ int tmpr = reliability_[sub];
283
+ reliability_[sub] = reliability_[sub2];
284
+ reliability_[sub2] = tmpr;
285
+ }
286
+ }
287
+ }
288
+ sorted_ = 1;
289
+ }
290
+
291
+ void ToteWithReliability::Dump(FILE* f) {
292
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
293
+ if (key_[sub] > 0) {
294
+ fprintf(f, "[%2d] %3d %6d %5d %4d\n",
295
+ sub, key_[sub], value_[sub], score_[sub], reliability_[sub]);
296
+ }
297
+ }
298
+ fprintf(f, " %d#\n", incr_count_);
299
+ }
@@ -0,0 +1,89 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_TOTE_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_TOTE_H_
7
+
8
+ #include <stdio.h>
9
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
10
+
11
+ // Take a set of <key, value> pairs and tote them up.
12
+ // After explicitly sorting, retrieve top key, value pairs
13
+ class Tote {
14
+ public:
15
+ Tote();
16
+ ~Tote();
17
+ void Reinit();
18
+ void AddGram();
19
+ void Add(uint8 ikey, int idelta);
20
+ void AddBytes(int ibytes) {byte_count_ += ibytes;}
21
+ int CurrentTopKey();
22
+ void Sort(int n);
23
+ void Dump(FILE* f);
24
+ uint16 GetGramCount() const {return gram_count_;}
25
+ uint16 GetIncrCount() const {return incr_count_;}
26
+ int GetByteCount() const {return byte_count_;}
27
+ int MaxSize() const {return kMaxSize_;}
28
+ uint8 Key(int i) const {return key_[i];}
29
+ int Value(int i) const {return value_[i];}
30
+ void SetGramCount(uint16 v) {gram_count_ = v;}
31
+ void SetIncrCount(uint16 v) {incr_count_ = v;}
32
+ void SetKey(int i, int v) {key_[i] = v;}
33
+ void SetValue(int i, int v) {value_[i] = v;}
34
+
35
+ private:
36
+ static const int kMaxSize_ = 24;
37
+ uint16 gram_count_; // Number of quadgrams/etc. scored
38
+ uint16 incr_count_; // Number of Add calls (1-3 per gram)
39
+ int byte_count_; // Bytes of text scored
40
+ // Align at multiple of 8 bytes
41
+ uint8 key_[kMaxSize_]; // Lang unassigned = 0, valid = 1..255
42
+ int value_[kMaxSize_]; // Probability score sum
43
+ };
44
+
45
+
46
+ // Take a set of <key, value, reliability> triples and tote them up.
47
+ // After explicitly sorting, retrieve top key, value, reliability triples
48
+ class ToteWithReliability {
49
+ public:
50
+ ToteWithReliability();
51
+ ~ToteWithReliability();
52
+ void Reinit();
53
+ void Add(uint8 ikey, int ibytes, int score, int ireliability);
54
+ int Find(uint8 ikey);
55
+ void AddClosePair(int subscr, int val) {closepair_[subscr] += val;}
56
+ int CurrentTopKey();
57
+ void Sort(int n);
58
+ void Dump(FILE* f);
59
+
60
+ ////void AddSeq(uint8 ikey) {ss_.Add(ikey);}
61
+ ////void ExtractSeq(int n, uint8* dst) {ss_.Extract(n, dst);}
62
+
63
+ int GetIncrCount() const {return incr_count_;}
64
+ int GetClosePair(int subscr) const {return closepair_[subscr];}
65
+ int MaxSize() const {return kMaxSize_;}
66
+ uint8 Key(int i) const {return key_[i];}
67
+ int Value(int i) const {return value_[i];}
68
+ int Score(int i) const {return score_[i];}
69
+ int Reliability(int i) const {return reliability_[i];}
70
+ void SetKey(int i, int v) {key_[i] = v;}
71
+ void SetValue(int i, int v) {value_[i] = v;}
72
+ void SetScore(int i, int v) {score_[i] = v;}
73
+ void SetReliability(int i, int v) {reliability_[i] = v;}
74
+
75
+ private:
76
+ static const int kMaxSize_ = 24;
77
+ static const int kMaxClosePairSize_ = 8;
78
+ int incr_count_; // Number of Add calls
79
+ int sorted_; // Contents have been sorted, cannot Add
80
+ // Align at multiple of 8 bytes
81
+ int closepair_[kMaxClosePairSize_];
82
+ uint8 key_[kMaxSize_]; // Lang unassigned = 0, valid = 1..255
83
+ int value_[kMaxSize_]; // Bytecount this lang
84
+ int score_[kMaxSize_]; // Probability score sum
85
+ int reliability_[kMaxSize_]; // Percentage 0..100
86
+ ////SubsetSequence ss_;
87
+ };
88
+
89
+ #endif // ENCODINGS_COMPACT_LANG_DET_TOTE_H_