krukid-cld 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (108) hide show
  1. data/LICENSE +27 -0
  2. data/Manifest +106 -0
  3. data/README.rdoc +173 -0
  4. data/Rakefile +15 -0
  5. data/base/basictypes.h +348 -0
  6. data/base/build_config.h +115 -0
  7. data/base/casts.h +156 -0
  8. data/base/commandlineflags.h +443 -0
  9. data/base/crash.h +41 -0
  10. data/base/dynamic_annotations.h +358 -0
  11. data/base/global_strip_options.h +59 -0
  12. data/base/log_severity.h +46 -0
  13. data/base/logging.h +1403 -0
  14. data/base/macros.h +243 -0
  15. data/base/port.h +54 -0
  16. data/base/scoped_ptr.h +428 -0
  17. data/base/stl_decl.h +0 -0
  18. data/base/stl_decl_msvc.h +107 -0
  19. data/base/string_util.h +29 -0
  20. data/base/strtoint.h +93 -0
  21. data/base/template_util.h +96 -0
  22. data/base/type_traits.h +198 -0
  23. data/base/vlog_is_on.h +143 -0
  24. data/build.sh +48 -0
  25. data/build.win.cmd +28 -0
  26. data/cld.gemspec +33 -0
  27. data/cld_encodings.h +95 -0
  28. data/encodings/compact_lang_det/#cldutil.cc# +905 -0
  29. data/encodings/compact_lang_det/#cldutil.h# +1205 -0
  30. data/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
  31. data/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
  32. data/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
  33. data/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
  34. data/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
  35. data/encodings/compact_lang_det/#tote.cc# +299 -0
  36. data/encodings/compact_lang_det/#tote.h# +89 -0
  37. data/encodings/compact_lang_det/cldutil.cc +905 -0
  38. data/encodings/compact_lang_det/cldutil.h +1205 -0
  39. data/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  40. data/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  41. data/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  42. data/encodings/compact_lang_det/compact_lang_det.h +145 -0
  43. data/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  44. data/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  45. data/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  46. data/encodings/compact_lang_det/compile.cmd +1 -0
  47. data/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  48. data/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  49. data/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  50. data/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  51. data/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  52. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  53. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  54. data/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  55. data/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  56. data/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  57. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  58. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  59. data/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  60. data/encodings/compact_lang_det/getonescriptspan.h +131 -0
  61. data/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  62. data/encodings/compact_lang_det/letterscript_enum.h +99 -0
  63. data/encodings/compact_lang_det/subsetsequence.cc +259 -0
  64. data/encodings/compact_lang_det/subsetsequence.h +44 -0
  65. data/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  66. data/encodings/compact_lang_det/tote.cc +299 -0
  67. data/encodings/compact_lang_det/tote.h +89 -0
  68. data/encodings/compact_lang_det/unittest_data.h +193 -0
  69. data/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  70. data/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  71. data/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  72. data/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
  73. data/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  74. data/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  75. data/encodings/compact_lang_det/win/cld_google.h +18 -0
  76. data/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  77. data/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  78. data/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  79. data/encodings/compact_lang_det/win/cld_logging.h +21 -0
  80. data/encodings/compact_lang_det/win/cld_macros.h +19 -0
  81. data/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  82. data/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  83. data/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  84. data/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  85. data/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  86. data/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  87. data/encodings/compact_lang_det/win/cld_utf.h +24 -0
  88. data/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  89. data/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  90. data/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  91. data/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  92. data/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  93. data/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  94. data/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  95. data/encodings/internal/encodings.cc +12 -0
  96. data/encodings/lang_enc.h +254 -0
  97. data/encodings/proto/encodings.pb.h +169 -0
  98. data/encodings/public/encodings.h +301 -0
  99. data/ext/cld/extconf.rb +8 -0
  100. data/krukid-cld.gemspec +33 -0
  101. data/languages/internal/#languages.cc# +337 -0
  102. data/languages/internal/languages.cc +337 -0
  103. data/languages/proto/languages.pb.h +179 -0
  104. data/languages/public/languages.h +379 -0
  105. data/lib/cld.rb +12 -0
  106. data/test/test.rb +570 -0
  107. data/thunk.cc +131 -0
  108. metadata +196 -0
@@ -0,0 +1,299 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/tote.h"
6
+ #include <string.h> // memset
7
+
8
+ #include "encodings/compact_lang_det/win/cld_logging.h"
9
+
10
+
11
+ // Take a set of <key, value> pairs and tote them up.
12
+ // After explicitly sorting, retrieve top key, value pairs
13
+ Tote::Tote() {
14
+ gram_count_ = 0;
15
+ incr_count_ = 0;
16
+ byte_count_ = 0;
17
+ memset(key_, 0, sizeof(key_));
18
+ // No need to initialize values
19
+ }
20
+
21
+ Tote::~Tote() {
22
+ }
23
+
24
+ void Tote::Reinit() {
25
+ gram_count_ = 0;
26
+ incr_count_ = 0;
27
+ byte_count_ = 0;
28
+ memset(key_, 0, sizeof(key_));
29
+ // No need to initialize values
30
+ }
31
+
32
+ // Increment count of quadgrams/trigrams/unigrams scored
33
+ void Tote::AddGram() {
34
+ ++gram_count_;
35
+ }
36
+
37
+ // Three-way associative, guaranteeing that the largest two counts are always
38
+ // in the data structure. kMaxSize must be a multiple of 3, and is tied to the
39
+ // subscript calculations here, which are for 8 sets of 3-way associative
40
+ // buckets. The subscripts for set N are [N], [N+8], and [N+16] used in a
41
+ // slightly-weird way: The initial probe point is [N] or [N+8], whichever
42
+ // is specified by key mod 16. In most cases (nearly *all* cases except Latin
43
+ // script), this entry matches and we update/return. The second probe is
44
+ // the other of [N] and [N+8]. The third probe is only used as a fallback to
45
+ // these two, and is there only for the rare case that there are three or more
46
+ // languages with Language enum values equal mod 8, contending within the same
47
+ // bucket. This can only happen in Latin and (rarely) Cyrillic scripts, because
48
+ // the other scripts have fewer than 17 languages total.
49
+ // If you change kMaxSize, change the constants 7/8/15/16 below
50
+ void Tote::Add(uint8 ikey, int idelta) {
51
+ DCHECK(ikey != 0);
52
+ ++incr_count_;
53
+
54
+ // Look for existing entry
55
+ int sub0 = ikey & 15;
56
+ if (key_[sub0] == ikey) {
57
+ value_[sub0] += idelta;
58
+ return;
59
+ }
60
+ int sub1 = sub0 ^ 8;
61
+ if (key_[sub1] == ikey) {
62
+ value_[sub1] += idelta;
63
+ return;
64
+ }
65
+ int sub2 = (ikey & 7) + 16;
66
+ if (key_[sub2] == ikey) {
67
+ value_[sub2] += idelta;
68
+ return;
69
+ }
70
+
71
+ // Allocate new entry
72
+ int alloc = -1;
73
+ if (key_[sub0] == 0) {
74
+ alloc = sub0;
75
+ } else if (key_[sub1] == 0) {
76
+ alloc = sub1;
77
+ } else if (key_[sub2] == 0) {
78
+ alloc = sub2;
79
+ } else {
80
+ // All choices allocated, need to replace smallest one
81
+ alloc = sub0;
82
+ if (value_[sub1] < value_[alloc]) {alloc = sub1;}
83
+ if (value_[sub2] < value_[alloc]) {alloc = sub2;}
84
+ }
85
+ key_[alloc] = ikey;
86
+ value_[alloc] = idelta;
87
+ return;
88
+ }
89
+
90
+ // Return current top key
91
+ int Tote::CurrentTopKey() {
92
+ int top_key = 0;
93
+ int top_value = -1;
94
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
95
+ if (key_[sub] == 0) {continue;}
96
+ if (top_value < value_[sub]) {
97
+ top_value = value_[sub];
98
+ top_key = key_[sub];
99
+ }
100
+ }
101
+ return top_key;
102
+ }
103
+
104
+
105
+ // Sort first n entries by decreasing order of value
106
+ // If key==0 other fields are not valid, treat value as -1
107
+ void Tote::Sort(int n) {
108
+ // This is n**2, but n is small
109
+ for (int sub = 0; sub < n; ++sub) {
110
+ if (key_[sub] == 0) {value_[sub] = -1;}
111
+
112
+ // Bubble sort key[sub] and entry[sub]
113
+ for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
114
+ if (key_[sub2] == 0) {value_[sub2] = -1;}
115
+ if (value_[sub] < value_[sub2]) {
116
+ // swap
117
+ uint8 tmpk = key_[sub];
118
+ key_[sub] = key_[sub2];
119
+ key_[sub2] = tmpk;
120
+ int tmpv = value_[sub];
121
+ value_[sub] = value_[sub2];
122
+ value_[sub2] = tmpv;
123
+ }
124
+ }
125
+ }
126
+ }
127
+
128
+ void Tote::Dump(FILE* f) {
129
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
130
+ if (key_[sub] > 0) {
131
+ fprintf(f, "[%2d] %3d %8d\n", sub, key_[sub], value_[sub]);
132
+ }
133
+ }
134
+ fprintf(f, "%d %d %d\n", gram_count_, incr_count_, byte_count_);
135
+ }
136
+
137
+
138
+
139
+
140
+ // Take a set of <key, value> pairs and tote them up.
141
+ // After explicitly sorting, retrieve top key, value pairs
142
+ ToteWithReliability::ToteWithReliability() {
143
+ // No need to initialize score_ or value_
144
+ incr_count_ = 0;
145
+ sorted_ = 0;
146
+ memset(closepair_, 0, sizeof(closepair_));
147
+ memset(key_, 0, sizeof(key_));
148
+ }
149
+
150
+ ToteWithReliability::~ToteWithReliability() {
151
+ }
152
+
153
+ void ToteWithReliability::Reinit() {
154
+ // No need to initialize score_ or value_
155
+ incr_count_ = 0;
156
+ sorted_ = 0;
157
+ memset(closepair_, 0, sizeof(closepair_));
158
+ memset(key_, 0, sizeof(key_));
159
+ ////ss_.Init();
160
+ }
161
+
162
+ // Weight reliability by ibytes
163
+ // Also see three-way associative comments above for Tote
164
+ void ToteWithReliability::Add(uint8 ikey, int ibytes,
165
+ int score, int ireliability) {
166
+ DCHECK(ikey != 0);
167
+ CHECK(sorted_ == 0);
168
+ ++incr_count_;
169
+
170
+ // Look for existing entry
171
+ int sub0 = ikey & 15;
172
+ if (key_[sub0] == ikey) {
173
+ value_[sub0] += ibytes;
174
+ score_[sub0] += score;
175
+ reliability_[sub0] += ireliability * ibytes;
176
+ return;
177
+ }
178
+ int sub1 = sub0 ^ 8;
179
+ if (key_[sub1] == ikey) {
180
+ value_[sub1] += ibytes;
181
+ score_[sub1] += score;
182
+ reliability_[sub1] += ireliability * ibytes;
183
+ return;
184
+ }
185
+ int sub2 = (ikey & 7) + 16;
186
+ if (key_[sub2] == ikey) {
187
+ value_[sub2] += ibytes;
188
+ score_[sub2] += score;
189
+ reliability_[sub2] += ireliability * ibytes;
190
+ return;
191
+ }
192
+
193
+ // Allocate new entry
194
+ int alloc = -1;
195
+ if (key_[sub0] == 0) {
196
+ alloc = sub0;
197
+ } else if (key_[sub1] == 0) {
198
+ alloc = sub1;
199
+ } else if (key_[sub2] == 0) {
200
+ alloc = sub2;
201
+ } else {
202
+ // All choices allocated, need to replace smallest one
203
+ alloc = sub0;
204
+ if (value_[sub1] < value_[alloc]) {alloc = sub1;}
205
+ if (value_[sub2] < value_[alloc]) {alloc = sub2;}
206
+ }
207
+ key_[alloc] = ikey;
208
+ value_[alloc] = ibytes;
209
+ score_[alloc] = score;
210
+ reliability_[alloc] = ireliability * ibytes;
211
+ return;
212
+ }
213
+
214
+ // Find subscript of a given packed language, or -1
215
+ int ToteWithReliability::Find(uint8 ikey) {
216
+ DCHECK(ikey != 0);
217
+
218
+ if (sorted_) {
219
+ // Linear search if sorted
220
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
221
+ if (key_[sub] == ikey) {return sub;}
222
+ }
223
+ return -1;
224
+ }
225
+
226
+ // Look for existing entry
227
+ int sub0 = ikey & 15;
228
+ if (key_[sub0] == ikey) {
229
+ return sub0;
230
+ }
231
+ int sub1 = sub0 ^ 8;
232
+ if (key_[sub1] == ikey) {
233
+ return sub1;
234
+ }
235
+ int sub2 = (ikey & 7) + 16;
236
+ if (key_[sub2] == ikey) {
237
+ return sub2;
238
+ }
239
+
240
+ return -1;
241
+ }
242
+
243
+ // Return current top key
244
+ int ToteWithReliability::CurrentTopKey() {
245
+ int top_key = 0;
246
+ int top_value = -1;
247
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
248
+ if (key_[sub] == 0) {continue;}
249
+ if (top_value < value_[sub]) {
250
+ top_value = value_[sub];
251
+ top_key = key_[sub];
252
+ }
253
+ }
254
+ return top_key;
255
+ }
256
+
257
+
258
+ // Sort first n entries by decreasing order of value
259
+ // If key==0 other fields are not valid, treat value as -1
260
+ void ToteWithReliability::Sort(int n) {
261
+ // This is n**2, but n is small
262
+ for (int sub = 0; sub < n; ++sub) {
263
+ if (key_[sub] == 0) {value_[sub] = -1;}
264
+
265
+ // Bubble sort key[sub] and entry[sub]
266
+ for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
267
+ if (key_[sub2] == 0) {value_[sub2] = -1;}
268
+ if (value_[sub] < value_[sub2]) {
269
+ // swap
270
+ uint8 tmpk = key_[sub];
271
+ key_[sub] = key_[sub2];
272
+ key_[sub2] = tmpk;
273
+
274
+ int tmpv = value_[sub];
275
+ value_[sub] = value_[sub2];
276
+ value_[sub2] = tmpv;
277
+
278
+ double tmps = score_[sub];
279
+ score_[sub] = score_[sub2];
280
+ score_[sub2] = tmps;
281
+
282
+ int tmpr = reliability_[sub];
283
+ reliability_[sub] = reliability_[sub2];
284
+ reliability_[sub2] = tmpr;
285
+ }
286
+ }
287
+ }
288
+ sorted_ = 1;
289
+ }
290
+
291
+ void ToteWithReliability::Dump(FILE* f) {
292
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
293
+ if (key_[sub] > 0) {
294
+ fprintf(f, "[%2d] %3d %6d %5d %4d\n",
295
+ sub, key_[sub], value_[sub], score_[sub], reliability_[sub]);
296
+ }
297
+ }
298
+ fprintf(f, " %d#\n", incr_count_);
299
+ }
@@ -0,0 +1,89 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_TOTE_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_TOTE_H_
7
+
8
+ #include <stdio.h>
9
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
10
+
11
+ // Take a set of <key, value> pairs and tote them up.
12
+ // After explicitly sorting, retrieve top key, value pairs
13
+ class Tote {
14
+ public:
15
+ Tote();
16
+ ~Tote();
17
+ void Reinit();
18
+ void AddGram();
19
+ void Add(uint8 ikey, int idelta);
20
+ void AddBytes(int ibytes) {byte_count_ += ibytes;}
21
+ int CurrentTopKey();
22
+ void Sort(int n);
23
+ void Dump(FILE* f);
24
+ uint16 GetGramCount() const {return gram_count_;}
25
+ uint16 GetIncrCount() const {return incr_count_;}
26
+ int GetByteCount() const {return byte_count_;}
27
+ int MaxSize() const {return kMaxSize_;}
28
+ uint8 Key(int i) const {return key_[i];}
29
+ int Value(int i) const {return value_[i];}
30
+ void SetGramCount(uint16 v) {gram_count_ = v;}
31
+ void SetIncrCount(uint16 v) {incr_count_ = v;}
32
+ void SetKey(int i, int v) {key_[i] = v;}
33
+ void SetValue(int i, int v) {value_[i] = v;}
34
+
35
+ private:
36
+ static const int kMaxSize_ = 24;
37
+ uint16 gram_count_; // Number of quadgrams/etc. scored
38
+ uint16 incr_count_; // Number of Add calls (1-3 per gram)
39
+ int byte_count_; // Bytes of text scored
40
+ // Align at multiple of 8 bytes
41
+ uint8 key_[kMaxSize_]; // Lang unassigned = 0, valid = 1..255
42
+ int value_[kMaxSize_]; // Probability score sum
43
+ };
44
+
45
+
46
+ // Take a set of <key, value, reliability> triples and tote them up.
47
+ // After explicitly sorting, retrieve top key, value, reliability triples
48
+ class ToteWithReliability {
49
+ public:
50
+ ToteWithReliability();
51
+ ~ToteWithReliability();
52
+ void Reinit();
53
+ void Add(uint8 ikey, int ibytes, int score, int ireliability);
54
+ int Find(uint8 ikey);
55
+ void AddClosePair(int subscr, int val) {closepair_[subscr] += val;}
56
+ int CurrentTopKey();
57
+ void Sort(int n);
58
+ void Dump(FILE* f);
59
+
60
+ ////void AddSeq(uint8 ikey) {ss_.Add(ikey);}
61
+ ////void ExtractSeq(int n, uint8* dst) {ss_.Extract(n, dst);}
62
+
63
+ int GetIncrCount() const {return incr_count_;}
64
+ int GetClosePair(int subscr) const {return closepair_[subscr];}
65
+ int MaxSize() const {return kMaxSize_;}
66
+ uint8 Key(int i) const {return key_[i];}
67
+ int Value(int i) const {return value_[i];}
68
+ int Score(int i) const {return score_[i];}
69
+ int Reliability(int i) const {return reliability_[i];}
70
+ void SetKey(int i, int v) {key_[i] = v;}
71
+ void SetValue(int i, int v) {value_[i] = v;}
72
+ void SetScore(int i, int v) {score_[i] = v;}
73
+ void SetReliability(int i, int v) {reliability_[i] = v;}
74
+
75
+ private:
76
+ static const int kMaxSize_ = 24;
77
+ static const int kMaxClosePairSize_ = 8;
78
+ int incr_count_; // Number of Add calls
79
+ int sorted_; // Contents have been sorted, cannot Add
80
+ // Align at multiple of 8 bytes
81
+ int closepair_[kMaxClosePairSize_];
82
+ uint8 key_[kMaxSize_]; // Lang unassigned = 0, valid = 1..255
83
+ int value_[kMaxSize_]; // Bytecount this lang
84
+ int score_[kMaxSize_]; // Probability score sum
85
+ int reliability_[kMaxSize_]; // Percentage 0..100
86
+ ////SubsetSequence ss_;
87
+ };
88
+
89
+ #endif // ENCODINGS_COMPACT_LANG_DET_TOTE_H_