language_detection 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,117 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/letterscript_enum.h"
6
+
7
+ #include "encodings/compact_lang_det/win/cld_logging.h"
8
+
9
+ static const char* kUnicodeLScriptNames[ULScript_NUM_SCRIPTS] = {
10
+ "Common",
11
+ "Latin",
12
+ "Greek",
13
+ "Cyrillic",
14
+ "Armenian",
15
+ "Hebrew",
16
+ "Arabic",
17
+ "Syriac",
18
+ "Thaana",
19
+ "Devanagari",
20
+ "Bengali",
21
+ "Gurmukhi",
22
+ "Gujarati",
23
+ "Oriya",
24
+ "Tamil",
25
+ "Telugu",
26
+ "Kannada",
27
+ "Malayalam",
28
+ "Sinhala",
29
+ "Thai",
30
+ "Lao",
31
+ "Tibetan",
32
+ "Myanmar",
33
+ "Georgian",
34
+ "HanCJK",
35
+ "Ethiopic",
36
+ "Cherokee",
37
+ "Canadian_Aboriginal",
38
+ "Ogham",
39
+ "Runic",
40
+ "Khmer",
41
+ "Mongolian",
42
+ "Yi",
43
+ "Old_Italic",
44
+ "Gothic",
45
+ "Deseret",
46
+ "Inherited",
47
+ "Tagalog",
48
+ "Hanunoo",
49
+ "Buhid",
50
+ "Tagbanwa",
51
+ "Limbu",
52
+ "Tai_Le",
53
+ "Linear_B",
54
+ "Ugaritic",
55
+ "Shavian",
56
+ "Osmanya",
57
+ "Cypriot",
58
+ "Buginese",
59
+ "Coptic",
60
+ "New_Tai_Lue",
61
+ "Glagolitic",
62
+ "Tifinagh",
63
+ "Syloti_Nagri",
64
+ "Old_Persian",
65
+ "Kharoshthi",
66
+ "Balinese",
67
+ "Cuneiform",
68
+ "Phoenician",
69
+ "Phags_Pa",
70
+ "Nko",
71
+
72
+ // Unicode 5.1 beta
73
+ "Sundanese",
74
+ "Lepcha",
75
+ "Ol_Chiki",
76
+ "Vai",
77
+ "Saurashtra",
78
+ "Kayah_Li",
79
+ "Rejang",
80
+ "Lycian",
81
+ "Carian",
82
+ "Lydian",
83
+ "Cham",
84
+ };
85
+
86
+
87
+ // Unicode 5.1 beta script names from
88
+ // http://www.unicode.org/Public/5.1.0/diffs/5.0.0-5.1.0.all.2.diffs
89
+ // NOTE: 'Vai ' => "Vaii" to make four letters, not three
90
+ // see http://unicode.org/iso15924/iso15924-codes.html
91
+ const char* const kLScriptName4[ULScript_NUM_SCRIPTS] = {
92
+ "Zyyy", "Latn", "Grek", "Cyrl", "Armn", "Hebr", "Arab", "Syrc",
93
+ "Thaa", "Deva", "Beng", "Guru", "Gujr", "Orya", "Taml", "Telu",
94
+ "Knda", "Mlym", "Sinh", "Thai", "Laoo", "Tibt", "Mymr", "Geor",
95
+ "Hani", "Ethi", "Cher", "Cans", "Ogam", "Runr", "Khmr", "Mong",
96
+
97
+ "Yiii", "Ital", "Goth", "Dsrt", "Zzzz", "Tglg", "Hano", "Buhd",
98
+ "Tagb", "Limb", "Tale", "Linb", "Ugar", "Shaw", "Osma", "Cprt",
99
+ "Bugi", "Copt", "Talu", "Glag", "Tfng", "Sylo", "Xpeo", "Khar",
100
+ "Bali", "Xsux", "Phnx", "Phag", "Nkoo",
101
+
102
+ // Unicode 5.1 beta
103
+ "Sund", "Lepc", "Olck", "Vaii", "Saur", "Kali", "Rjng", "Lyci",
104
+ "Cari", "Lydi", "Cham",
105
+ };
106
+
107
+
108
+ const char* UnicodeLScriptName(const UnicodeLScript ls) {
109
+ CHECK(ls >= 0 && ls < ULScript_NUM_SCRIPTS);
110
+ return kUnicodeLScriptNames[ls];
111
+ }
112
+
113
+
114
+ const char* UnicodeLScriptCode(const UnicodeLScript ls) {
115
+ CHECK(ls >= 0 && ls < ULScript_NUM_SCRIPTS);
116
+ return kLScriptName4[ls];
117
+ }
@@ -0,0 +1,99 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
6
+ #define ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
7
+
8
// Letter scripts. The enumerators are consecutive integers starting at zero,
// so a value can be used directly as an array index; ULScript_NUM_SCRIPTS is
// the number of scripts and must stay last.
enum UnicodeLScript {
  ULScript_Common = 0,
  ULScript_Latin, ULScript_Greek, ULScript_Cyrillic, ULScript_Armenian,
  ULScript_Hebrew, ULScript_Arabic, ULScript_Syriac, ULScript_Thaana,
  ULScript_Devanagari, ULScript_Bengali, ULScript_Gurmukhi, ULScript_Gujarati,
  ULScript_Oriya, ULScript_Tamil, ULScript_Telugu, ULScript_Kannada,
  ULScript_Malayalam, ULScript_Sinhala, ULScript_Thai, ULScript_Lao,
  ULScript_Tibetan, ULScript_Myanmar, ULScript_Georgian, ULScript_HanCJK,
  ULScript_Ethiopic, ULScript_Cherokee, ULScript_Canadian_Aboriginal,
  ULScript_Ogham,
  ULScript_Runic, ULScript_Khmer, ULScript_Mongolian, ULScript_Yi,
  ULScript_Old_Italic, ULScript_Gothic, ULScript_Deseret, ULScript_Inherited,
  ULScript_Tagalog, ULScript_Hanunoo, ULScript_Buhid, ULScript_Tagbanwa,
  ULScript_Limbu, ULScript_Tai_Le, ULScript_Linear_B, ULScript_Ugaritic,
  ULScript_Shavian, ULScript_Osmanya, ULScript_Cypriot, ULScript_Buginese,
  ULScript_Coptic, ULScript_New_Tai_Lue, ULScript_Glagolitic,
  ULScript_Tifinagh,
  ULScript_Syloti_Nagri, ULScript_Old_Persian, ULScript_Kharoshthi,
  ULScript_Balinese,
  ULScript_Cuneiform, ULScript_Phoenician, ULScript_Phags_Pa, ULScript_Nko,

  // Unicode 5.1
  ULScript_Sundanese, ULScript_Lepcha, ULScript_Ol_Chiki, ULScript_Vai,
  ULScript_Saurashtra, ULScript_Kayah_Li, ULScript_Rejang, ULScript_Lycian,
  ULScript_Carian, ULScript_Lydian, ULScript_Cham,

  ULScript_NUM_SCRIPTS
};


// Alias for ULScript_Common.
static const UnicodeLScript UNKNOWN_LSCRIPT = ULScript_Common;


// Returns the descriptive name of script ls, e.g. "Latin".
// Passing a value that is not a valid UnicodeLScript is a fatal error.
const char* UnicodeLScriptName(const UnicodeLScript ls);


// Returns the 4-letter code of script ls, e.g. "Latn".
// Passing a value that is not a valid UnicodeLScript is a fatal error.
const char* UnicodeLScriptCode(const UnicodeLScript ls);
98
+
99
+ #endif // ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
@@ -0,0 +1,259 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // Remember a subset of a sequence of values, using a modest amount of memory
6
+
7
+ /***
8
+ Design:
9
+ Accumulate in powers of three, using 3-way median to collapse entries.
10
+ At any given time, there is one most-dense (highest power of 3) range of
11
+ entries and a series of less-dense ranges that hold 0..2 entries each. There
12
+ is a bounded-size storage array of S cells for all the entries.
13
+
14
+ The overflow detect is set up so that a new higher power of 3, K+1, is
15
+ triggered precisely when range K has 3n entries and all ranges < K have
16
+ zero entries.
17
+
18
+ In general, think of the range sizes as a multi-digit base 3 number, except
19
+ the highest digit may exceed 2:
20
+
21
+ 3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
22
+ 0 0 0 0 3n-1 2 2 unused:1
23
+
24
+ There are a total of 3n-1 + 2 + 2 entries in use. Assume a size limit S at
25
+ one more than that, and we add a new 3**0 entry and "carry" by performing
26
+ medians on any group of 3 elements:
27
+
28
+ 3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
29
+ 0 0 0 0 3n-1 2 3 unused:0
30
+ 0 0 0 0 3n-1 3 0 carry unused:2
31
+ 0 0 0 0 3n 0 0 carry unused:4
32
+
33
+ To accumulate 2 entries at all levels < K and 3 just before the first carry at
34
+ level 0, we need 2*K + 1 unused cells after doing all carries, or five cells
35
+ in this case. Since we only have 4 cells in the example above, we need to
36
+ make room by starting a new power of three:
37
+
38
+ 3**6 3**5 3**4 3**3 3**2 3**1 3**0
39
+ 0 0 0 0 3n 0 0 K=2 unused:4
40
+ 0 0 0 n 0 0 0 K=3 unused:2n+4
41
+
42
+ In the code below, we don't worry about overflow from the topmost place.
43
+
44
+
45
+ ***/
46
+
47
+ #include "encodings/compact_lang_det/subsetsequence.h"
48
+ #include <stdio.h>
49
+
50
+ #include "encodings/compact_lang_det/win/cld_logging.h"
51
+
52
// Debugging aid: writes label, then the first n ints of v, then a newline to
// stdout. The label and every value are each followed by one space.
void DumpInts(const char* label, const int* v, int n) {
  printf("%s ", label);
  int idx = 0;
  while (idx < n) {
    printf("%d ", v[idx]);
    ++idx;
  }
  printf("\n");
}
59
+
60
+ void DumpUint8s(const char* label, const uint8* v, int n) {
61
+ printf("%s ", label);
62
+ for (int i = 0; i < n; ++i) {
63
+ printf("%d ", v[i]);
64
+ }
65
+ printf("\n");
66
+ }
67
+
68
+ // Return median of seq_[sub] .. seq_[sub+2], favoring middle element
69
+ uint8 SubsetSequence::Median3(int sub) {
70
+ if (seq_[sub] == seq_[sub + 1]) {
71
+ return seq_[sub];
72
+ }
73
+ if (seq_[sub] == seq_[sub + 2]) {
74
+ return seq_[sub];
75
+ }
76
+ return seq_[sub + 1];
77
+ }
78
+
79
+ void SubsetSequence::Init() {
80
+ // printf("Init\n");
81
+
82
+ k_ = 0;
83
+ count_[0] = 0;
84
+ next_e_ = 0;
85
+ seq_[0] = 0; // Default value if no calls to Add
86
+
87
+ // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
88
+ int reserve = (2 * k_ + 1);
89
+ level_limit_e_ = kMaxSeq_ - reserve;
90
+ level_limit_e_ = (level_limit_e_ / 3) * 3; // Round down to multiple of 3
91
+ limit_e_ = level_limit_e_;
92
+ }
93
+
94
+ // Compress level k by 3x, creating level k+1
95
+ void SubsetSequence::NewLevel() {
96
+ // printf("NewLevel 3 ** %d\n", k_ + 1);
97
+ //DumpUint8s("count[k]", count_, k_ + 1);
98
+ //DumpUint8s("seq[next]", seq_, next_e_);
99
+
100
+ // Incoming level must be an exact multiple of three in size
101
+ CHECK((count_[k_] % 3) == 0);
102
+ int k_size = count_[k_];
103
+ int new_size = k_size / 3;
104
+
105
+ // Compress down by 3x, via median
106
+ for (int j = 0; j < new_size; ++j) {
107
+ seq_[j] = Median3(j * 3);
108
+ }
109
+
110
+ // Update counts
111
+ count_[k_] = 0;
112
+ // Else Overflow -- just continue with 3x dense Level K
113
+ if (k_ < (kMaxLevel_ - 1)) {++k_;}
114
+ count_[k_] = new_size;
115
+
116
+ // Update limits
117
+ next_e_ = new_size;
118
+ limit_e_ = next_e_ + 3;
119
+
120
+ // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
121
+ int reserve = (2 * k_ + 1);
122
+ level_limit_e_ = kMaxSeq_ - reserve;
123
+ level_limit_e_ = (level_limit_e_ / 3) * 3; // Round down to multiple of 3
124
+ //
125
+ //DumpUint8s("after: count[k]", count_, k_ + 1);
126
+ //DumpUint8s("after: seq[next]", seq_, next_e_);
127
+ }
128
+
129
+ void SubsetSequence::DoCarries() {
130
+ CHECK(count_[k_] > 3); // We depend on count_[k_] being > 3 to stop while
131
+ // Make room by carrying
132
+
133
+ //DumpUint8s("DoCarries count[k]", count_, k_ + 1);
134
+ //DumpUint8s("DoCarries seq[next]", seq_, next_e_);
135
+
136
+ int i = 0;
137
+ while (count_[i] == 3) {
138
+ next_e_ -= 3;
139
+ seq_[next_e_] = Median3(next_e_);
140
+ ++next_e_;
141
+ count_[i] = 0;
142
+ ++count_[i + 1];
143
+ ++i;
144
+ }
145
+ limit_e_ = next_e_ + 3;
146
+
147
+ //DumpUint8s("after: DoCarries count[k]", count_, k_ + 1);
148
+ //DumpUint8s("after: DoCarries seq[next]", seq_, next_e_);
149
+
150
+ // If we just fully carried into level K,
151
+ // Make sure there is now enough room, else start level K + 1
152
+ if (i >= k_) {
153
+ CHECK(count_[k_] == next_e_);
154
+ if (next_e_ >= level_limit_e_) {
155
+ NewLevel();
156
+ }
157
+ }
158
+ }
159
+
160
+ void SubsetSequence::Add(uint8 e) {
161
+ // Add an entry then carry as needed
162
+ seq_[next_e_] = e;
163
+ ++next_e_;
164
+ ++count_[0];
165
+
166
+ if (next_e_ >= limit_e_) {
167
+ DoCarries();
168
+ }
169
+ }
170
+
171
+
172
+ // Collapse tail end by simple median across disparate-weight values,
173
+ // dropping or duplicating last value if need be.
174
+ // This routine is idempotent.
175
+ void SubsetSequence::Flush() {
176
+ // printf("Flush %d\n", count_[k_]);
177
+ int start_tail = count_[k_];
178
+ int size_tail = next_e_ - start_tail;
179
+ if ((size_tail % 3) == 2) {
180
+ seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value
181
+ ++size_tail;
182
+ }
183
+
184
+ // Compress tail down by 3x, via median
185
+ int new_size = size_tail / 3; // May delete last value
186
+ for (int j = 0; j < new_size; ++j) {
187
+ seq_[start_tail + j] = Median3(start_tail + j * 3);
188
+ }
189
+
190
+ next_e_ = start_tail + new_size;
191
+ count_[k_] = next_e_;
192
+ }
193
+
194
+
195
+ // Extract representative pattern of exactly N values into dst[0..n-1]
196
+ // This routine may be called multiple times, but it may downsample as a
197
+ // side effect, causing subsequent calls with larger N to get poor answers.
198
+ void SubsetSequence::Extract(int to_n, uint8* dst) {
199
+ // Collapse partial-carries in tail
200
+ Flush();
201
+
202
+ // Just use Bresenham to resample
203
+ int from_n = next_e_;
204
+ if (to_n >= from_n) {
205
+ // Up-sample from_n => to_n
206
+ int err = to_n - 1; // bias toward no overshoot
207
+ int j = 0;
208
+ for (int i = 0; i < to_n; ++i) {
209
+ dst[i] = seq_[j];
210
+ err -= from_n;
211
+ if (err < 0) {
212
+ ++j;
213
+ err += to_n;
214
+ }
215
+ }
216
+ } else {
217
+ // Get to the point that the number of samples is <= 3 * to_n
218
+ while (next_e_ > (to_n * 3)) {
219
+ // Compress down by 3x, via median
220
+ // printf("Extract, median %d / 3\n", next_e_);
221
+ if ((next_e_ % 3) == 2) {
222
+ seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value
223
+ ++next_e_;
224
+ }
225
+ int new_size = next_e_ / 3; // May delete last value
226
+ for (int j = 0; j < new_size; ++j) {
227
+ seq_[j] = Median3(j * 3);
228
+ }
229
+ next_e_ = new_size;
230
+ count_[k_] = next_e_;
231
+ }
232
+ from_n = next_e_;
233
+
234
+ if (to_n == from_n) {
235
+ // Copy verbatim
236
+ for (int i = 0; i < to_n; ++i) {
237
+ dst[i] = seq_[i];
238
+ }
239
+ return;
240
+ }
241
+
242
+ // Down-sample from_n => to_n, using medians
243
+ int err = 0; // Bias to immediate median sample
244
+ int j = 0;
245
+ for (int i = 0; i < from_n; ++i) {
246
+ err -= to_n;
247
+ if (err < 0) {
248
+ if (i <= (next_e_ - 2)) {
249
+ dst[j] = Median3(i);
250
+ } else {
251
+ dst[j] = seq_[i];
252
+ }
253
+ ++j;
254
+ err += from_n;
255
+ }
256
+ }
257
+ }
258
+
259
+ }
@@ -0,0 +1,44 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // Remember a subset of a sequence of values, using a modest amount of memory
6
+
7
+ #ifndef ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
8
+ #define ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
9
+
10
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
11
+ #include "encodings/compact_lang_det/win/cld_google.h"
12
+
13
+
14
+ class SubsetSequence {
15
+ public:
16
+ void Init();
17
+ void Add(uint8 e);
18
+ void Extract(int n, uint8* dst);
19
+ SubsetSequence() {Init();}
20
+ ~SubsetSequence() {};
21
+
22
+ private:
23
+ uint8 Median3(int sub);
24
+ void NewLevel();
25
+ void DoCarries();
26
+ void Flush();
27
+
28
+ static const int kMaxLevel_ = 16; // 3**16 ~= 43M (3**20 ~= 3.4B)
29
+ static const int kMaxSeq_ = 128;
30
+
31
+ int k_;
32
+ int next_e_;
33
+ int limit_e_;
34
+ int level_limit_e_;
35
+ uint8 seq_[kMaxSeq_];
36
+ uint8 count_[kMaxLevel_ + 1]; // +1 allows graceful overflow
37
+
38
+ DISALLOW_EVIL_CONSTRUCTORS(SubsetSequence);
39
+
40
+ // Require enough room to end up with 40 entries plus carrying space
41
+ COMPILE_ASSERT(kMaxSeq_ >= (kMaxLevel_ * 2 + 40), kMaxSeq__is_too_small);
42
+ };
43
+
44
+ #endif // ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
@@ -0,0 +1,99 @@
1
+ // Copyright 2008 Google Inc. All Rights Reserved.
2
+ // Author: dsites@google.com (Dick Sites)
3
+ /*
4
+ #include "testing/base/public/gunit.h"
5
+ #include "testing/lib/strings/overrun_sensitive_memory_block.h"
6
+ #include "cld/encodings/compact_lang_det/subsetsequence.h"
7
+
8
+ // This always passes. It is just scaffolding to exercise the subsequence
9
+ // facility, which is likely to get abandoned soon. dsites 2008.11.17
10
+ //
11
+ TEST(SubsetSequence, foo) {
12
+ uint8 dst[120];
13
+
14
+ // Create 120-element vector
15
+ printf("Creating %d items:\n", 120);
16
+ SubsetSequence ss;
17
+ for (int i = 0; i < 120; ++i) {
18
+ ss.Add(i);
19
+ }
20
+
21
+ // Extract various lengths
22
+ for (int n = 120; n >= 0; --n) {
23
+ ss.Extract(n, dst);
24
+ printf("[%d] ", n);
25
+ for (int i = 0; i < n; ++i) {
26
+ printf("%d ", dst[i]);
27
+ }
28
+ printf("\n");
29
+ }
30
+
31
+ printf("\n");
32
+ printf("\n");
33
+
34
+ // Create 120-element vector of 7 items each
35
+ printf("Creating %d items:\n", 120);
36
+ ss.Init();
37
+ for (int i = 0; i < 120; ++i) {
38
+ ss.Add(i / 7);
39
+ }
40
+
41
+ // Extract various lengths
42
+ for (int n = 120; n >= 0; --n) {
43
+ ss.Extract(n, dst);
44
+ printf("[%d] ", n);
45
+ for (int i = 0; i < n; ++i) {
46
+ printf("%d ", dst[i]);
47
+ }
48
+ printf("\n");
49
+ }
50
+
51
+ printf("\n");
52
+ printf("\n");
53
+
54
+
55
+ // Create 400 element vector of patterns
56
+ int nn1 = 400;
57
+ int divisor = (nn1 + 239) / 240; // Max inserted value = 240
58
+ printf("Creating %d items:\n", nn1);
59
+ ss.Init();
60
+ for (int i = 0; i < nn1; ++i) {
61
+ ss.Add(i / divisor);
62
+ }
63
+
64
+ // Extract 12-item summary lengths
65
+ int n1 = 12;
66
+ ss.Extract(n1, dst);
67
+ printf("[%d] ", n1);
68
+ for (int i = 0; i < n1; ++i) {
69
+ printf("%d ", dst[i]);
70
+ }
71
+ printf("\n");
72
+
73
+ printf("\n");
74
+ printf("\n");
75
+
76
+ // Create 10**n element vector of patterns
77
+ int pow_10 = 1;
78
+ for (int nn = 0; nn < 9; ++nn) {
79
+ printf("Creating %d items:\n", pow_10);
80
+ int divisor = (pow_10 + 239) / 240; // Max inserted value = 240
81
+ ss.Init();
82
+ for (int i = 0; i < pow_10; ++i) {
83
+ ss.Add(i / divisor);
84
+ }
85
+
86
+ // Extract 12-item summary lengths
87
+ int n = 12;
88
+ ss.Extract(n, dst);
89
+ printf("[%d] ", n);
90
+ for (int i = 0; i < n; ++i) {
91
+ printf("%d ", dst[i]);
92
+ }
93
+ printf("\n");
94
+
95
+ pow_10 *= 10;
96
+ }
97
+
98
+ }
99
+ */