language_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,117 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/letterscript_enum.h"
6
+
7
+ #include "encodings/compact_lang_det/win/cld_logging.h"
8
+
9
+ static const char* kUnicodeLScriptNames[ULScript_NUM_SCRIPTS] = {
10
+ "Common",
11
+ "Latin",
12
+ "Greek",
13
+ "Cyrillic",
14
+ "Armenian",
15
+ "Hebrew",
16
+ "Arabic",
17
+ "Syriac",
18
+ "Thaana",
19
+ "Devanagari",
20
+ "Bengali",
21
+ "Gurmukhi",
22
+ "Gujarati",
23
+ "Oriya",
24
+ "Tamil",
25
+ "Telugu",
26
+ "Kannada",
27
+ "Malayalam",
28
+ "Sinhala",
29
+ "Thai",
30
+ "Lao",
31
+ "Tibetan",
32
+ "Myanmar",
33
+ "Georgian",
34
+ "HanCJK",
35
+ "Ethiopic",
36
+ "Cherokee",
37
+ "Canadian_Aboriginal",
38
+ "Ogham",
39
+ "Runic",
40
+ "Khmer",
41
+ "Mongolian",
42
+ "Yi",
43
+ "Old_Italic",
44
+ "Gothic",
45
+ "Deseret",
46
+ "Inherited",
47
+ "Tagalog",
48
+ "Hanunoo",
49
+ "Buhid",
50
+ "Tagbanwa",
51
+ "Limbu",
52
+ "Tai_Le",
53
+ "Linear_B",
54
+ "Ugaritic",
55
+ "Shavian",
56
+ "Osmanya",
57
+ "Cypriot",
58
+ "Buginese",
59
+ "Coptic",
60
+ "New_Tai_Lue",
61
+ "Glagolitic",
62
+ "Tifinagh",
63
+ "Syloti_Nagri",
64
+ "Old_Persian",
65
+ "Kharoshthi",
66
+ "Balinese",
67
+ "Cuneiform",
68
+ "Phoenician",
69
+ "Phags_Pa",
70
+ "Nko",
71
+
72
+ // Unicode 5.1 beta
73
+ "Sundanese",
74
+ "Lepcha",
75
+ "Ol_Chiki",
76
+ "Vai",
77
+ "Saurashtra",
78
+ "Kayah_Li",
79
+ "Rejang",
80
+ "Lycian",
81
+ "Carian",
82
+ "Lydian",
83
+ "Cham",
84
+ };
85
+
86
+
87
+ // Unicode 5.1 beta script names from
88
+ // http://www.unicode.org/Public/5.1.0/diffs/5.0.0-5.1.0.all.2.diffs
89
+ // NOTE: 'Vai ' => "Vaii" to make four letters, not three
90
+ // see http://unicode.org/iso15924/iso15924-codes.html
91
+ const char* const kLScriptName4[ULScript_NUM_SCRIPTS] = {
92
+ "Zyyy", "Latn", "Grek", "Cyrl", "Armn", "Hebr", "Arab", "Syrc",
93
+ "Thaa", "Deva", "Beng", "Guru", "Gujr", "Orya", "Taml", "Telu",
94
+ "Knda", "Mlym", "Sinh", "Thai", "Laoo", "Tibt", "Mymr", "Geor",
95
+ "Hani", "Ethi", "Cher", "Cans", "Ogam", "Runr", "Khmr", "Mong",
96
+
97
+ "Yiii", "Ital", "Goth", "Dsrt", "Zzzz", "Tglg", "Hano", "Buhd",
98
+ "Tagb", "Limb", "Tale", "Linb", "Ugar", "Shaw", "Osma", "Cprt",
99
+ "Bugi", "Copt", "Talu", "Glag", "Tfng", "Sylo", "Xpeo", "Khar",
100
+ "Bali", "Xsux", "Phnx", "Phag", "Nkoo",
101
+
102
+ // Unicode 5.1 beta
103
+ "Sund", "Lepc", "Olck", "Vaii", "Saur", "Kali", "Rjng", "Lyci",
104
+ "Cari", "Lydi", "Cham",
105
+ };
106
+
107
+
108
+ const char* UnicodeLScriptName(const UnicodeLScript ls) {
109
+ CHECK(ls >= 0 && ls < ULScript_NUM_SCRIPTS);
110
+ return kUnicodeLScriptNames[ls];
111
+ }
112
+
113
+
114
+ const char* UnicodeLScriptCode(const UnicodeLScript ls) {
115
+ CHECK(ls >= 0 && ls < ULScript_NUM_SCRIPTS);
116
+ return kLScriptName4[ls];
117
+ }
@@ -0,0 +1,99 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
6
+ #define ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
7
+
8
// Script identifiers. Values are consecutive, starting at 0, so an enumerator
// can be used directly as an index into a table with ULScript_NUM_SCRIPTS
// entries. The "// a..b" markers give the numeric values of each group.
enum UnicodeLScript {
  // 0..7
  ULScript_Common = 0, ULScript_Latin, ULScript_Greek, ULScript_Cyrillic,
  ULScript_Armenian, ULScript_Hebrew, ULScript_Arabic, ULScript_Syriac,
  // 8..15
  ULScript_Thaana, ULScript_Devanagari, ULScript_Bengali, ULScript_Gurmukhi,
  ULScript_Gujarati, ULScript_Oriya, ULScript_Tamil, ULScript_Telugu,
  // 16..23
  ULScript_Kannada, ULScript_Malayalam, ULScript_Sinhala, ULScript_Thai,
  ULScript_Lao, ULScript_Tibetan, ULScript_Myanmar, ULScript_Georgian,
  // 24..31
  ULScript_HanCJK, ULScript_Ethiopic, ULScript_Cherokee,
  ULScript_Canadian_Aboriginal, ULScript_Ogham, ULScript_Runic,
  ULScript_Khmer, ULScript_Mongolian,
  // 32..39
  ULScript_Yi, ULScript_Old_Italic, ULScript_Gothic, ULScript_Deseret,
  ULScript_Inherited, ULScript_Tagalog, ULScript_Hanunoo, ULScript_Buhid,
  // 40..47
  ULScript_Tagbanwa, ULScript_Limbu, ULScript_Tai_Le, ULScript_Linear_B,
  ULScript_Ugaritic, ULScript_Shavian, ULScript_Osmanya, ULScript_Cypriot,
  // 48..55
  ULScript_Buginese, ULScript_Coptic, ULScript_New_Tai_Lue,
  ULScript_Glagolitic, ULScript_Tifinagh, ULScript_Syloti_Nagri,
  ULScript_Old_Persian, ULScript_Kharoshthi,
  // 56..60
  ULScript_Balinese, ULScript_Cuneiform, ULScript_Phoenician,
  ULScript_Phags_Pa, ULScript_Nko,

  // Unicode 5.1 (61..71)
  ULScript_Sundanese, ULScript_Lepcha, ULScript_Ol_Chiki, ULScript_Vai,
  ULScript_Saurashtra, ULScript_Kayah_Li, ULScript_Rejang, ULScript_Lycian,
  ULScript_Carian, ULScript_Lydian, ULScript_Cham,

  // Number of enumerators above; not a script itself.
  ULScript_NUM_SCRIPTS
};
85
+
86
+
87
+ static const UnicodeLScript UNKNOWN_LSCRIPT = ULScript_Common;
88
+
89
+
90
+ // Return the name corresponding to the script ls, e.g. "Latin".
91
+ // It is a fatal error if ls is not a valid UnicodeLScript.
92
+ const char* UnicodeLScriptName(const UnicodeLScript ls);
93
+
94
+
95
+ // Return the 4-letter code corresponding to the script ls, e.g. "Latn".
96
+ // It is a fatal error if ls is not a valid UnicodeLScript.
97
+ const char* UnicodeLScriptCode(const UnicodeLScript ls);
98
+
99
+ #endif // ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
@@ -0,0 +1,259 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // Remember a subset of a sequence of values, using a modest amount of memory
6
+
7
+ /***
8
+ Design:
9
+ Accumulate in powers of three, using 3-way median to collapse entries.
10
+ At any given time, there is one most-dense (highest power of 3) range of
11
+ entries and a series of less-dense ranges that hold 0..2 entries each. There
12
+ is a bounded-size storage array of S cells for all the entries.
13
+
14
+ The overflow detection is set up so that a new higher power of 3, K+1, is
15
+ triggered precisely when range K has 3n entries and all ranges < K have
16
+ zero entries.
17
+
18
+ In general, think of the range sizes as a multi-digit base 3 number, except
19
+ the highest digit may exceed 2:
20
+
21
+ 3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
22
+ 0 0 0 0 3n-1 2 2 unused:1
23
+
24
+ There are a total of 3n-1 + 2 + 2 entries in use. Assume a size limit S at
25
+ one more than that, and we add a new 3**0 entry and "carry" by performing
26
+ medians on any group of 3 elements:
27
+
28
+ 3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
29
+ 0 0 0 0 3n-1 2 3 unused:0
30
+ 0 0 0 0 3n-1 3 0 carry unused:2
31
+ 0 0 0 0 3n 0 0 carry unused:4
32
+
33
+ To accumulate 2 entries at all levels < K and 3 just before the first carry at
34
+ level 0, we need 2*K + 1 unused cells after doing all carries, or five cells
35
+ in this case. Since we only have 4 cells in the example above, we need to
36
+ make room by starting a new power of three:
37
+
38
+ 3**6 3**5 3**4 3**3 3**2 3**1 3**0
39
+ 0 0 0 0 3n 0 0 K=2 unused:4
40
+ 0 0 0 n 0 0 0 K=3 unused:2n+4
41
+
42
+ In the code below, we don't worry about overflow from the topmost place.
43
+
44
+
45
+ ***/
46
+
47
+ #include "encodings/compact_lang_det/subsetsequence.h"
48
+ #include <stdio.h>
49
+
50
+ #include "encodings/compact_lang_det/win/cld_logging.h"
51
+
52
// Debug helper: prints |label|, then the first |n| ints of |v| separated by
// single spaces, then a newline, all on stdout. Prints no values if n <= 0.
void DumpInts(const char* label, const int* v, int n) {
  printf("%s ", label);
  const int* cursor = v;
  for (int left = n; left > 0; --left) {
    printf("%d ", *cursor);
    ++cursor;
  }
  printf("\n");
}
59
+
60
+ void DumpUint8s(const char* label, const uint8* v, int n) {
61
+ printf("%s ", label);
62
+ for (int i = 0; i < n; ++i) {
63
+ printf("%d ", v[i]);
64
+ }
65
+ printf("\n");
66
+ }
67
+
68
+ // Return median of seq_[sub] .. seq_[sub+2], favoring middle element
69
+ uint8 SubsetSequence::Median3(int sub) {
70
+ if (seq_[sub] == seq_[sub + 1]) {
71
+ return seq_[sub];
72
+ }
73
+ if (seq_[sub] == seq_[sub + 2]) {
74
+ return seq_[sub];
75
+ }
76
+ return seq_[sub + 1];
77
+ }
78
+
79
+ void SubsetSequence::Init() {
80
+ // printf("Init\n");
81
+
82
+ k_ = 0;
83
+ count_[0] = 0;
84
+ next_e_ = 0;
85
+ seq_[0] = 0; // Default value if no calls to Add
86
+
87
+ // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
88
+ int reserve = (2 * k_ + 1);
89
+ level_limit_e_ = kMaxSeq_ - reserve;
90
+ level_limit_e_ = (level_limit_e_ / 3) * 3; // Round down to multiple of 3
91
+ limit_e_ = level_limit_e_;
92
+ }
93
+
94
+ // Compress level k by 3x, creating level k+1
95
+ void SubsetSequence::NewLevel() {
96
+ // printf("NewLevel 3 ** %d\n", k_ + 1);
97
+ //DumpUint8s("count[k]", count_, k_ + 1);
98
+ //DumpUint8s("seq[next]", seq_, next_e_);
99
+
100
+ // Incoming level must be an exact multiple of three in size
101
+ CHECK((count_[k_] % 3) == 0);
102
+ int k_size = count_[k_];
103
+ int new_size = k_size / 3;
104
+
105
+ // Compress down by 3x, via median
106
+ for (int j = 0; j < new_size; ++j) {
107
+ seq_[j] = Median3(j * 3);
108
+ }
109
+
110
+ // Update counts
111
+ count_[k_] = 0;
112
+ // Else Overflow -- just continue with 3x dense Level K
113
+ if (k_ < (kMaxLevel_ - 1)) {++k_;}
114
+ count_[k_] = new_size;
115
+
116
+ // Update limits
117
+ next_e_ = new_size;
118
+ limit_e_ = next_e_ + 3;
119
+
120
+ // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
121
+ int reserve = (2 * k_ + 1);
122
+ level_limit_e_ = kMaxSeq_ - reserve;
123
+ level_limit_e_ = (level_limit_e_ / 3) * 3; // Round down to multiple of 3
124
+ //
125
+ //DumpUint8s("after: count[k]", count_, k_ + 1);
126
+ //DumpUint8s("after: seq[next]", seq_, next_e_);
127
+ }
128
+
129
+ void SubsetSequence::DoCarries() {
130
+ CHECK(count_[k_] > 3); // We depend on count_[k_] being > 3 to stop while
131
+ // Make room by carrying
132
+
133
+ //DumpUint8s("DoCarries count[k]", count_, k_ + 1);
134
+ //DumpUint8s("DoCarries seq[next]", seq_, next_e_);
135
+
136
+ int i = 0;
137
+ while (count_[i] == 3) {
138
+ next_e_ -= 3;
139
+ seq_[next_e_] = Median3(next_e_);
140
+ ++next_e_;
141
+ count_[i] = 0;
142
+ ++count_[i + 1];
143
+ ++i;
144
+ }
145
+ limit_e_ = next_e_ + 3;
146
+
147
+ //DumpUint8s("after: DoCarries count[k]", count_, k_ + 1);
148
+ //DumpUint8s("after: DoCarries seq[next]", seq_, next_e_);
149
+
150
+ // If we just fully carried into level K,
151
+ // Make sure there is now enough room, else start level K + 1
152
+ if (i >= k_) {
153
+ CHECK(count_[k_] == next_e_);
154
+ if (next_e_ >= level_limit_e_) {
155
+ NewLevel();
156
+ }
157
+ }
158
+ }
159
+
160
+ void SubsetSequence::Add(uint8 e) {
161
+ // Add an entry then carry as needed
162
+ seq_[next_e_] = e;
163
+ ++next_e_;
164
+ ++count_[0];
165
+
166
+ if (next_e_ >= limit_e_) {
167
+ DoCarries();
168
+ }
169
+ }
170
+
171
+
172
+ // Collapse tail end by simple median across disparate-weight values,
173
+ // dropping or duplicating last value if need be.
174
+ // This routine is idempotent.
175
+ void SubsetSequence::Flush() {
176
+ // printf("Flush %d\n", count_[k_]);
177
+ int start_tail = count_[k_];
178
+ int size_tail = next_e_ - start_tail;
179
+ if ((size_tail % 3) == 2) {
180
+ seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value
181
+ ++size_tail;
182
+ }
183
+
184
+ // Compress tail down by 3x, via median
185
+ int new_size = size_tail / 3; // May delete last value
186
+ for (int j = 0; j < new_size; ++j) {
187
+ seq_[start_tail + j] = Median3(start_tail + j * 3);
188
+ }
189
+
190
+ next_e_ = start_tail + new_size;
191
+ count_[k_] = next_e_;
192
+ }
193
+
194
+
195
+ // Extract representative pattern of exactly N values into dst[0..n-1]
196
+ // This routine may be called multiple times, but it may downsample as a
197
+ // side effect, causing subsequent calls with larger N to get poor answers.
198
+ void SubsetSequence::Extract(int to_n, uint8* dst) {
199
+ // Collapse partial-carries in tail
200
+ Flush();
201
+
202
+ // Just use Bresenham to resample
203
+ int from_n = next_e_;
204
+ if (to_n >= from_n) {
205
+ // Up-sample from_n => to_n
206
+ int err = to_n - 1; // bias toward no overshoot
207
+ int j = 0;
208
+ for (int i = 0; i < to_n; ++i) {
209
+ dst[i] = seq_[j];
210
+ err -= from_n;
211
+ if (err < 0) {
212
+ ++j;
213
+ err += to_n;
214
+ }
215
+ }
216
+ } else {
217
+ // Get to the point that the number of samples is <= 3 * to_n
218
+ while (next_e_ > (to_n * 3)) {
219
+ // Compress down by 3x, via median
220
+ // printf("Extract, median %d / 3\n", next_e_);
221
+ if ((next_e_ % 3) == 2) {
222
+ seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value
223
+ ++next_e_;
224
+ }
225
+ int new_size = next_e_ / 3; // May delete last value
226
+ for (int j = 0; j < new_size; ++j) {
227
+ seq_[j] = Median3(j * 3);
228
+ }
229
+ next_e_ = new_size;
230
+ count_[k_] = next_e_;
231
+ }
232
+ from_n = next_e_;
233
+
234
+ if (to_n == from_n) {
235
+ // Copy verbatim
236
+ for (int i = 0; i < to_n; ++i) {
237
+ dst[i] = seq_[i];
238
+ }
239
+ return;
240
+ }
241
+
242
+ // Down-sample from_n => to_n, using medians
243
+ int err = 0; // Bias to immediate median sample
244
+ int j = 0;
245
+ for (int i = 0; i < from_n; ++i) {
246
+ err -= to_n;
247
+ if (err < 0) {
248
+ if (i <= (next_e_ - 2)) {
249
+ dst[j] = Median3(i);
250
+ } else {
251
+ dst[j] = seq_[i];
252
+ }
253
+ ++j;
254
+ err += from_n;
255
+ }
256
+ }
257
+ }
258
+
259
+ }
@@ -0,0 +1,44 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // Remember a subset of a sequence of values, using a modest amount of memory
6
+
7
+ #ifndef ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
8
+ #define ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
9
+
10
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
11
+ #include "encodings/compact_lang_det/win/cld_google.h"
12
+
13
+
14
+ class SubsetSequence {
15
+ public:
16
+ void Init();
17
+ void Add(uint8 e);
18
+ void Extract(int n, uint8* dst);
19
+ SubsetSequence() {Init();}
20
+ ~SubsetSequence() {};
21
+
22
+ private:
23
+ uint8 Median3(int sub);
24
+ void NewLevel();
25
+ void DoCarries();
26
+ void Flush();
27
+
28
+ static const int kMaxLevel_ = 16; // 3**16 ~= 43M (3**20 ~= 3.4B)
29
+ static const int kMaxSeq_ = 128;
30
+
31
+ int k_;
32
+ int next_e_;
33
+ int limit_e_;
34
+ int level_limit_e_;
35
+ uint8 seq_[kMaxSeq_];
36
+ uint8 count_[kMaxLevel_ + 1]; // +1 allows graceful overflow
37
+
38
+ DISALLOW_EVIL_CONSTRUCTORS(SubsetSequence);
39
+
40
+ // Require enough room to end up with 40 entries plus carrying space
41
+ COMPILE_ASSERT(kMaxSeq_ >= (kMaxLevel_ * 2 + 40), kMaxSeq__is_too_small);
42
+ };
43
+
44
+ #endif // ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
@@ -0,0 +1,99 @@
1
+ // Copyright 2008 Google Inc. All Rights Reserved.
2
+ // Author: dsites@google.com (Dick Sites)
3
+ /*
4
+ #include "testing/base/public/gunit.h"
5
+ #include "testing/lib/strings/overrun_sensitive_memory_block.h"
6
+ #include "cld/encodings/compact_lang_det/subsetsequence.h"
7
+
8
+ // This always passes. It is just scaffolding to exercise the subsequence
9
+ // facility, which is likely to get abandoned soon. dsites 2008.11.17
10
+ //
11
+ TEST(SubsetSequence, foo) {
12
+ uint8 dst[120];
13
+
14
+ // Create 120-element vector
15
+ printf("Creating %d items:\n", 120);
16
+ SubsetSequence ss;
17
+ for (int i = 0; i < 120; ++i) {
18
+ ss.Add(i);
19
+ }
20
+
21
+ // Extract various lengths
22
+ for (int n = 120; n >= 0; --n) {
23
+ ss.Extract(n, dst);
24
+ printf("[%d] ", n);
25
+ for (int i = 0; i < n; ++i) {
26
+ printf("%d ", dst[i]);
27
+ }
28
+ printf("\n");
29
+ }
30
+
31
+ printf("\n");
32
+ printf("\n");
33
+
34
+ // Create 120-element vector of 7 items each
35
+ printf("Creating %d items:\n", 120);
36
+ ss.Init();
37
+ for (int i = 0; i < 120; ++i) {
38
+ ss.Add(i / 7);
39
+ }
40
+
41
+ // Extract various lengths
42
+ for (int n = 120; n >= 0; --n) {
43
+ ss.Extract(n, dst);
44
+ printf("[%d] ", n);
45
+ for (int i = 0; i < n; ++i) {
46
+ printf("%d ", dst[i]);
47
+ }
48
+ printf("\n");
49
+ }
50
+
51
+ printf("\n");
52
+ printf("\n");
53
+
54
+
55
+ // Create 400 element vector of patterns
56
+ int nn1 = 400;
57
+ int divisor = (nn1 + 239) / 240; // Max inserted value = 240
58
+ printf("Creating %d items:\n", nn1);
59
+ ss.Init();
60
+ for (int i = 0; i < nn1; ++i) {
61
+ ss.Add(i / divisor);
62
+ }
63
+
64
+ // Extract 12-item summary lengths
65
+ int n1 = 12;
66
+ ss.Extract(n1, dst);
67
+ printf("[%d] ", n1);
68
+ for (int i = 0; i < n1; ++i) {
69
+ printf("%d ", dst[i]);
70
+ }
71
+ printf("\n");
72
+
73
+ printf("\n");
74
+ printf("\n");
75
+
76
+ // Create 10**n element vector of patterns
77
+ int pow_10 = 1;
78
+ for (int nn = 0; nn < 9; ++nn) {
79
+ printf("Creating %d items:\n", pow_10);
80
+ int divisor = (pow_10 + 239) / 240; // Max inserted value = 240
81
+ ss.Init();
82
+ for (int i = 0; i < pow_10; ++i) {
83
+ ss.Add(i / divisor);
84
+ }
85
+
86
+ // Extract 12-item summary lengths
87
+ int n = 12;
88
+ ss.Extract(n, dst);
89
+ printf("[%d] ", n);
90
+ for (int i = 0; i < n; ++i) {
91
+ printf("%d ", dst[i]);
92
+ }
93
+ printf("\n");
94
+
95
+ pow_10 *= 10;
96
+ }
97
+
98
+ }
99
+ */