language_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
@@ -0,0 +1,117 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#include "encodings/compact_lang_det/letterscript_enum.h"
|
6
|
+
|
7
|
+
#include "encodings/compact_lang_det/win/cld_logging.h"
|
8
|
+
|
9
|
+
// Long script names, indexed by UnicodeLScript value.
// Entry order must match the enumerator order in letterscript_enum.h.
// The pointers themselves are const (like kLScriptName4) so that no table
// slot can be reassigned at run time.
static const char* const kUnicodeLScriptNames[ULScript_NUM_SCRIPTS] = {
  // 0
  "Common", "Latin", "Greek", "Cyrillic",
  "Armenian", "Hebrew", "Arabic", "Syriac",
  // 8
  "Thaana", "Devanagari", "Bengali", "Gurmukhi",
  "Gujarati", "Oriya", "Tamil", "Telugu",
  // 16
  "Kannada", "Malayalam", "Sinhala", "Thai",
  "Lao", "Tibetan", "Myanmar", "Georgian",
  // 24
  "HanCJK", "Ethiopic", "Cherokee", "Canadian_Aboriginal",
  "Ogham", "Runic", "Khmer", "Mongolian",
  // 32
  "Yi", "Old_Italic", "Gothic", "Deseret",
  "Inherited", "Tagalog", "Hanunoo", "Buhid",
  // 40
  "Tagbanwa", "Limbu", "Tai_Le", "Linear_B",
  "Ugaritic", "Shavian", "Osmanya", "Cypriot",
  // 48
  "Buginese", "Coptic", "New_Tai_Lue", "Glagolitic",
  "Tifinagh", "Syloti_Nagri", "Old_Persian", "Kharoshthi",
  // 56
  "Balinese", "Cuneiform", "Phoenician", "Phags_Pa",
  "Nko",

  // Unicode 5.1 beta (starts at index 61)
  "Sundanese", "Lepcha", "Ol_Chiki", "Vai",
  "Saurashtra", "Kayah_Li", "Rejang", "Lycian",
  "Carian", "Lydian", "Cham",
};
|
85
|
+
|
86
|
+
|
87
|
+
// Unicode 5.1 beta script names from
|
88
|
+
// http://www.unicode.org/Public/5.1.0/diffs/5.0.0-5.1.0.all.2.diffs
|
89
|
+
// NOTE: 'Vai ' => "Vaii" to make four letters, not three
|
90
|
+
// see http://unicode.org/iso15924/iso15924-codes.html
|
91
|
+
const char* const kLScriptName4[ULScript_NUM_SCRIPTS] = {
|
92
|
+
"Zyyy", "Latn", "Grek", "Cyrl", "Armn", "Hebr", "Arab", "Syrc",
|
93
|
+
"Thaa", "Deva", "Beng", "Guru", "Gujr", "Orya", "Taml", "Telu",
|
94
|
+
"Knda", "Mlym", "Sinh", "Thai", "Laoo", "Tibt", "Mymr", "Geor",
|
95
|
+
"Hani", "Ethi", "Cher", "Cans", "Ogam", "Runr", "Khmr", "Mong",
|
96
|
+
|
97
|
+
"Yiii", "Ital", "Goth", "Dsrt", "Zzzz", "Tglg", "Hano", "Buhd",
|
98
|
+
"Tagb", "Limb", "Tale", "Linb", "Ugar", "Shaw", "Osma", "Cprt",
|
99
|
+
"Bugi", "Copt", "Talu", "Glag", "Tfng", "Sylo", "Xpeo", "Khar",
|
100
|
+
"Bali", "Xsux", "Phnx", "Phag", "Nkoo",
|
101
|
+
|
102
|
+
// Unicode 5.1 beta
|
103
|
+
"Sund", "Lepc", "Olck", "Vaii", "Saur", "Kali", "Rjng", "Lyci",
|
104
|
+
"Cari", "Lydi", "Cham",
|
105
|
+
};
|
106
|
+
|
107
|
+
|
108
|
+
const char* UnicodeLScriptName(const UnicodeLScript ls) {
|
109
|
+
CHECK(ls >= 0 && ls < ULScript_NUM_SCRIPTS);
|
110
|
+
return kUnicodeLScriptNames[ls];
|
111
|
+
}
|
112
|
+
|
113
|
+
|
114
|
+
const char* UnicodeLScriptCode(const UnicodeLScript ls) {
|
115
|
+
CHECK(ls >= 0 && ls < ULScript_NUM_SCRIPTS);
|
116
|
+
return kLScriptName4[ls];
|
117
|
+
}
|
@@ -0,0 +1,99 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
|
7
|
+
|
8
|
+
// Unicode letter scripts. Values are consecutive starting at zero and are
// used directly as array indexes by the name/code lookup tables, so the
// order here must not change. The number in each trailing comment is the
// value of the row's first enumerator.
enum UnicodeLScript {
  ULScript_Common = 0,                                        //  0
  ULScript_Latin,
  ULScript_Greek,
  ULScript_Cyrillic,
  ULScript_Armenian, ULScript_Hebrew,                         //  4
  ULScript_Arabic, ULScript_Syriac,
  ULScript_Thaana, ULScript_Devanagari,                       //  8
  ULScript_Bengali, ULScript_Gurmukhi,
  ULScript_Gujarati, ULScript_Oriya,                          // 12
  ULScript_Tamil, ULScript_Telugu,
  ULScript_Kannada, ULScript_Malayalam,                       // 16
  ULScript_Sinhala, ULScript_Thai,
  ULScript_Lao, ULScript_Tibetan,                             // 20
  ULScript_Myanmar, ULScript_Georgian,
  ULScript_HanCJK, ULScript_Ethiopic,                         // 24
  ULScript_Cherokee, ULScript_Canadian_Aboriginal,
  ULScript_Ogham, ULScript_Runic,                             // 28
  ULScript_Khmer, ULScript_Mongolian,
  ULScript_Yi, ULScript_Old_Italic,                           // 32
  ULScript_Gothic, ULScript_Deseret,
  ULScript_Inherited, ULScript_Tagalog,                       // 36
  ULScript_Hanunoo, ULScript_Buhid,
  ULScript_Tagbanwa, ULScript_Limbu,                          // 40
  ULScript_Tai_Le, ULScript_Linear_B,
  ULScript_Ugaritic, ULScript_Shavian,                        // 44
  ULScript_Osmanya, ULScript_Cypriot,
  ULScript_Buginese, ULScript_Coptic,                         // 48
  ULScript_New_Tai_Lue, ULScript_Glagolitic,
  ULScript_Tifinagh, ULScript_Syloti_Nagri,                   // 52
  ULScript_Old_Persian, ULScript_Kharoshthi,
  ULScript_Balinese, ULScript_Cuneiform,                      // 56
  ULScript_Phoenician, ULScript_Phags_Pa,
  ULScript_Nko,                                               // 60

  // Unicode 5.1
  ULScript_Sundanese, ULScript_Lepcha,                        // 61
  ULScript_Ol_Chiki, ULScript_Vai,
  ULScript_Saurashtra, ULScript_Kayah_Li,                     // 65
  ULScript_Rejang, ULScript_Lycian,
  ULScript_Carian, ULScript_Lydian,                           // 69
  ULScript_Cham,

  // Number of scripts above; not itself a script. Keep last.
  ULScript_NUM_SCRIPTS                                        // 72
};
|
85
|
+
|
86
|
+
|
87
|
+
// Alias for ULScript_Common. Going by its name, this is the value to use when
// a script is unknown (its callers are not visible in this header).
static const UnicodeLScript UNKNOWN_LSCRIPT = ULScript_Common;
|
88
|
+
|
89
|
+
|
90
|
+
// Return the name corresponding to the script ls, e.g. "Latin".
|
91
|
+
// It is a fatal error if ls is not a valid UnicodeLScript.
|
92
|
+
const char* UnicodeLScriptName(const UnicodeLScript ls);
|
93
|
+
|
94
|
+
|
95
|
+
// Return the 4-letter code corresponding to the script ls, e.g. "Latn".
|
96
|
+
// It is a fatal error if ls is not a valid UnicodeLScript.
|
97
|
+
const char* UnicodeLScriptCode(const UnicodeLScript ls);
|
98
|
+
|
99
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
|
@@ -0,0 +1,259 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
// Remember a subset of a sequence of values, using a modest amount of memory
|
6
|
+
|
7
|
+
/***
|
8
|
+
Design:
|
9
|
+
Accumulate in powers of three, using 3-way median to collapse entries.
|
10
|
+
At any given time, there is one most-dense (highest power of 3) range of
|
11
|
+
entries and a series of less-dense ranges that hold 0..2 entries each. There
|
12
|
+
is a bounded-size storage array of S cells for all the entries.
|
13
|
+
|
14
|
+
The overflow detect is set up so that a new higher power of 3, K+1, is
|
15
|
+
triggered precisely when range K has 3n entries and all ranges < K have
|
16
|
+
zero entries.
|
17
|
+
|
18
|
+
In general, think of the range sizes as a multi-digit base 3 number, except
|
19
|
+
the highest digit may exceed 2:
|
20
|
+
|
21
|
+
3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
|
22
|
+
0 0 0 0 3n-1 2 2 unused:1
|
23
|
+
|
24
|
+
There are a total of 3n-1 + 2 + 2 entries in use. Assume a size limit S at
|
25
|
+
one more than that, and we add a new 3**0 entry and "carry" by performing
|
26
|
+
medians on any group of 3 elements:
|
27
|
+
|
28
|
+
3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
|
29
|
+
0 0 0 0 3n-1 2 3 unused:0
|
30
|
+
0 0 0 0 3n-1 3 0 carry unused:2
|
31
|
+
0 0 0 0 3n 0 0 carry unused:4
|
32
|
+
|
33
|
+
To accumulate 2 entries at all levels < K and 3 just before the first carry at
|
34
|
+
level 0, we need 2*K + 1 unused cells after doing all carries, or five cells
|
35
|
+
in this case. Since we only have 4 cells in the example above, we need to
|
36
|
+
make room by starting a new power of three:
|
37
|
+
|
38
|
+
3**6 3**5 3**4 3**3 3**2 3**1 3**0
|
39
|
+
0 0 0 0 3n 0 0 K=2 unused:4
|
40
|
+
0 0 0 n 0 0 0 K=3 unused:2n+4
|
41
|
+
|
42
|
+
In the code below, we don't worry about overflow from the topmost place.
|
43
|
+
|
44
|
+
|
45
|
+
***/
|
46
|
+
|
47
|
+
#include "encodings/compact_lang_det/subsetsequence.h"
|
48
|
+
#include <stdio.h>
|
49
|
+
|
50
|
+
#include "encodings/compact_lang_det/win/cld_logging.h"
|
51
|
+
|
52
|
+
// Debug helper: print label, then the first n ints of v separated by single
// spaces, then a newline, all to stdout. Prints no values when n <= 0.
void DumpInts(const char* label, const int* v, int n) {
  printf("%s ", label);
  const int* cursor = v;
  int remaining = n;
  while (remaining > 0) {
    printf("%d ", *cursor);
    ++cursor;
    --remaining;
  }
  printf("\n");
}
|
59
|
+
|
60
|
+
void DumpUint8s(const char* label, const uint8* v, int n) {
|
61
|
+
printf("%s ", label);
|
62
|
+
for (int i = 0; i < n; ++i) {
|
63
|
+
printf("%d ", v[i]);
|
64
|
+
}
|
65
|
+
printf("\n");
|
66
|
+
}
|
67
|
+
|
68
|
+
// Return median of seq_[sub] .. seq_[sub+2], favoring middle element
|
69
|
+
uint8 SubsetSequence::Median3(int sub) {
|
70
|
+
if (seq_[sub] == seq_[sub + 1]) {
|
71
|
+
return seq_[sub];
|
72
|
+
}
|
73
|
+
if (seq_[sub] == seq_[sub + 2]) {
|
74
|
+
return seq_[sub];
|
75
|
+
}
|
76
|
+
return seq_[sub + 1];
|
77
|
+
}
|
78
|
+
|
79
|
+
// Reset to the empty state: level 0, no stored entries.
void SubsetSequence::Init() {
  // No entries yet; level 0 is the densest (and only) level.
  next_e_ = 0;
  k_ = 0;
  count_[0] = 0;

  // Extract() reads seq_[0] when nothing was ever added, so give it a
  // defined default.
  seq_[0] = 0;

  // Capacity for level k_: leave 2*k_+1 cells free for carrying, then round
  // down to a multiple of 3 so the level can be compressed 3:1 exactly.
  const int usable = kMaxSeq_ - (2 * k_ + 1);
  level_limit_e_ = usable - (usable % 3);
  limit_e_ = level_limit_e_;
}
|
93
|
+
|
94
|
+
// Compress level k by 3x, creating level k+1
|
95
|
+
void SubsetSequence::NewLevel() {
|
96
|
+
// printf("NewLevel 3 ** %d\n", k_ + 1);
|
97
|
+
//DumpUint8s("count[k]", count_, k_ + 1);
|
98
|
+
//DumpUint8s("seq[next]", seq_, next_e_);
|
99
|
+
|
100
|
+
// Incoming level must be an exact multiple of three in size
|
101
|
+
CHECK((count_[k_] % 3) == 0);
|
102
|
+
int k_size = count_[k_];
|
103
|
+
int new_size = k_size / 3;
|
104
|
+
|
105
|
+
// Compress down by 3x, via median
|
106
|
+
for (int j = 0; j < new_size; ++j) {
|
107
|
+
seq_[j] = Median3(j * 3);
|
108
|
+
}
|
109
|
+
|
110
|
+
// Update counts
|
111
|
+
count_[k_] = 0;
|
112
|
+
// Else Overflow -- just continue with 3x dense Level K
|
113
|
+
if (k_ < (kMaxLevel_ - 1)) {++k_;}
|
114
|
+
count_[k_] = new_size;
|
115
|
+
|
116
|
+
// Update limits
|
117
|
+
next_e_ = new_size;
|
118
|
+
limit_e_ = next_e_ + 3;
|
119
|
+
|
120
|
+
// Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
|
121
|
+
int reserve = (2 * k_ + 1);
|
122
|
+
level_limit_e_ = kMaxSeq_ - reserve;
|
123
|
+
level_limit_e_ = (level_limit_e_ / 3) * 3; // Round down to multiple of 3
|
124
|
+
//
|
125
|
+
//DumpUint8s("after: count[k]", count_, k_ + 1);
|
126
|
+
//DumpUint8s("after: seq[next]", seq_, next_e_);
|
127
|
+
}
|
128
|
+
|
129
|
+
void SubsetSequence::DoCarries() {
|
130
|
+
CHECK(count_[k_] > 3); // We depend on count_[k_] being > 3 to stop while
|
131
|
+
// Make room by carrying
|
132
|
+
|
133
|
+
//DumpUint8s("DoCarries count[k]", count_, k_ + 1);
|
134
|
+
//DumpUint8s("DoCarries seq[next]", seq_, next_e_);
|
135
|
+
|
136
|
+
int i = 0;
|
137
|
+
while (count_[i] == 3) {
|
138
|
+
next_e_ -= 3;
|
139
|
+
seq_[next_e_] = Median3(next_e_);
|
140
|
+
++next_e_;
|
141
|
+
count_[i] = 0;
|
142
|
+
++count_[i + 1];
|
143
|
+
++i;
|
144
|
+
}
|
145
|
+
limit_e_ = next_e_ + 3;
|
146
|
+
|
147
|
+
//DumpUint8s("after: DoCarries count[k]", count_, k_ + 1);
|
148
|
+
//DumpUint8s("after: DoCarries seq[next]", seq_, next_e_);
|
149
|
+
|
150
|
+
// If we just fully carried into level K,
|
151
|
+
// Make sure there is now enough room, else start level K + 1
|
152
|
+
if (i >= k_) {
|
153
|
+
CHECK(count_[k_] == next_e_);
|
154
|
+
if (next_e_ >= level_limit_e_) {
|
155
|
+
NewLevel();
|
156
|
+
}
|
157
|
+
}
|
158
|
+
}
|
159
|
+
|
160
|
+
// Append one value as a level-0 entry, carrying into coarser levels once the
// current limit is reached.
void SubsetSequence::Add(uint8 e) {
  seq_[next_e_] = e;
  next_e_ += 1;
  count_[0] += 1;

  if (next_e_ < limit_e_) {
    return;  // Still room; no carry needed.
  }
  DoCarries();
}
|
170
|
+
|
171
|
+
|
172
|
+
// Collapse tail end by simple median across disparate-weight values,
|
173
|
+
// dropping or duplicating last value if need be.
|
174
|
+
// This routine is idempotent.
|
175
|
+
void SubsetSequence::Flush() {
|
176
|
+
// printf("Flush %d\n", count_[k_]);
|
177
|
+
int start_tail = count_[k_];
|
178
|
+
int size_tail = next_e_ - start_tail;
|
179
|
+
if ((size_tail % 3) == 2) {
|
180
|
+
seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value
|
181
|
+
++size_tail;
|
182
|
+
}
|
183
|
+
|
184
|
+
// Compress tail down by 3x, via median
|
185
|
+
int new_size = size_tail / 3; // May delete last value
|
186
|
+
for (int j = 0; j < new_size; ++j) {
|
187
|
+
seq_[start_tail + j] = Median3(start_tail + j * 3);
|
188
|
+
}
|
189
|
+
|
190
|
+
next_e_ = start_tail + new_size;
|
191
|
+
count_[k_] = next_e_;
|
192
|
+
}
|
193
|
+
|
194
|
+
|
195
|
+
// Extract representative pattern of exactly N values into dst[0..n-1]
|
196
|
+
// This routine may be called multiple times, but it may downsample as a
|
197
|
+
// side effect, causing subsequent calls with larger N to get poor answers.
|
198
|
+
void SubsetSequence::Extract(int to_n, uint8* dst) {
|
199
|
+
// Collapse partial-carries in tail
|
200
|
+
Flush();
|
201
|
+
|
202
|
+
// Just use Bresenham to resample
|
203
|
+
int from_n = next_e_;
|
204
|
+
if (to_n >= from_n) {
|
205
|
+
// Up-sample from_n => to_n
|
206
|
+
int err = to_n - 1; // bias toward no overshoot
|
207
|
+
int j = 0;
|
208
|
+
for (int i = 0; i < to_n; ++i) {
|
209
|
+
dst[i] = seq_[j];
|
210
|
+
err -= from_n;
|
211
|
+
if (err < 0) {
|
212
|
+
++j;
|
213
|
+
err += to_n;
|
214
|
+
}
|
215
|
+
}
|
216
|
+
} else {
|
217
|
+
// Get to the point that the number of samples is <= 3 * to_n
|
218
|
+
while (next_e_ > (to_n * 3)) {
|
219
|
+
// Compress down by 3x, via median
|
220
|
+
// printf("Extract, median %d / 3\n", next_e_);
|
221
|
+
if ((next_e_ % 3) == 2) {
|
222
|
+
seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value
|
223
|
+
++next_e_;
|
224
|
+
}
|
225
|
+
int new_size = next_e_ / 3; // May delete last value
|
226
|
+
for (int j = 0; j < new_size; ++j) {
|
227
|
+
seq_[j] = Median3(j * 3);
|
228
|
+
}
|
229
|
+
next_e_ = new_size;
|
230
|
+
count_[k_] = next_e_;
|
231
|
+
}
|
232
|
+
from_n = next_e_;
|
233
|
+
|
234
|
+
if (to_n == from_n) {
|
235
|
+
// Copy verbatim
|
236
|
+
for (int i = 0; i < to_n; ++i) {
|
237
|
+
dst[i] = seq_[i];
|
238
|
+
}
|
239
|
+
return;
|
240
|
+
}
|
241
|
+
|
242
|
+
// Down-sample from_n => to_n, using medians
|
243
|
+
int err = 0; // Bias to immediate median sample
|
244
|
+
int j = 0;
|
245
|
+
for (int i = 0; i < from_n; ++i) {
|
246
|
+
err -= to_n;
|
247
|
+
if (err < 0) {
|
248
|
+
if (i <= (next_e_ - 2)) {
|
249
|
+
dst[j] = Median3(i);
|
250
|
+
} else {
|
251
|
+
dst[j] = seq_[i];
|
252
|
+
}
|
253
|
+
++j;
|
254
|
+
err += from_n;
|
255
|
+
}
|
256
|
+
}
|
257
|
+
}
|
258
|
+
|
259
|
+
}
|
@@ -0,0 +1,44 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
// Remember a subset of a sequence of values, using a modest amount of memory
|
6
|
+
|
7
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
|
8
|
+
#define ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
|
9
|
+
|
10
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
11
|
+
#include "encodings/compact_lang_det/win/cld_google.h"
|
12
|
+
|
13
|
+
|
14
|
+
class SubsetSequence {
|
15
|
+
public:
|
16
|
+
void Init();
|
17
|
+
void Add(uint8 e);
|
18
|
+
void Extract(int n, uint8* dst);
|
19
|
+
SubsetSequence() {Init();}
|
20
|
+
~SubsetSequence() {};
|
21
|
+
|
22
|
+
private:
|
23
|
+
uint8 Median3(int sub);
|
24
|
+
void NewLevel();
|
25
|
+
void DoCarries();
|
26
|
+
void Flush();
|
27
|
+
|
28
|
+
static const int kMaxLevel_ = 16; // 3**16 ~= 43M (3**20 ~= 3.4B)
|
29
|
+
static const int kMaxSeq_ = 128;
|
30
|
+
|
31
|
+
int k_;
|
32
|
+
int next_e_;
|
33
|
+
int limit_e_;
|
34
|
+
int level_limit_e_;
|
35
|
+
uint8 seq_[kMaxSeq_];
|
36
|
+
uint8 count_[kMaxLevel_ + 1]; // +1 allows graceful overflow
|
37
|
+
|
38
|
+
DISALLOW_EVIL_CONSTRUCTORS(SubsetSequence);
|
39
|
+
|
40
|
+
// Require enough room to end up with 40 entries plus carrying space
|
41
|
+
COMPILE_ASSERT(kMaxSeq_ >= (kMaxLevel_ * 2 + 40), kMaxSeq__is_too_small);
|
42
|
+
};
|
43
|
+
|
44
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
|
@@ -0,0 +1,99 @@
|
|
1
|
+
// Copyright 2008 Google Inc. All Rights Reserved.
|
2
|
+
// Author: dsites@google.com (Dick Sites)
|
3
|
+
/*
|
4
|
+
#include "testing/base/public/gunit.h"
|
5
|
+
#include "testing/lib/strings/overrun_sensitive_memory_block.h"
|
6
|
+
#include "cld/encodings/compact_lang_det/subsetsequence.h"
|
7
|
+
|
8
|
+
// This always passes. It is just scaffolding to exercise the subsequence
|
9
|
+
// facility, which is likely to get abandoned soon. dsites 2008.11.17
|
10
|
+
//
|
11
|
+
TEST(SubsetSequence, foo) {
|
12
|
+
uint8 dst[120];
|
13
|
+
|
14
|
+
// Create 120-element vector
|
15
|
+
printf("Creating %d items:\n", 120);
|
16
|
+
SubsetSequence ss;
|
17
|
+
for (int i = 0; i < 120; ++i) {
|
18
|
+
ss.Add(i);
|
19
|
+
}
|
20
|
+
|
21
|
+
// Extract various lengths
|
22
|
+
for (int n = 120; n >= 0; --n) {
|
23
|
+
ss.Extract(n, dst);
|
24
|
+
printf("[%d] ", n);
|
25
|
+
for (int i = 0; i < n; ++i) {
|
26
|
+
printf("%d ", dst[i]);
|
27
|
+
}
|
28
|
+
printf("\n");
|
29
|
+
}
|
30
|
+
|
31
|
+
printf("\n");
|
32
|
+
printf("\n");
|
33
|
+
|
34
|
+
// Create 120-element vector of 7 items each
|
35
|
+
printf("Creating %d items:\n", 120);
|
36
|
+
ss.Init();
|
37
|
+
for (int i = 0; i < 120; ++i) {
|
38
|
+
ss.Add(i / 7);
|
39
|
+
}
|
40
|
+
|
41
|
+
// Extract various lengths
|
42
|
+
for (int n = 120; n >= 0; --n) {
|
43
|
+
ss.Extract(n, dst);
|
44
|
+
printf("[%d] ", n);
|
45
|
+
for (int i = 0; i < n; ++i) {
|
46
|
+
printf("%d ", dst[i]);
|
47
|
+
}
|
48
|
+
printf("\n");
|
49
|
+
}
|
50
|
+
|
51
|
+
printf("\n");
|
52
|
+
printf("\n");
|
53
|
+
|
54
|
+
|
55
|
+
// Create 400 element vector of patterns
|
56
|
+
int nn1 = 400;
|
57
|
+
int divisor = (nn1 + 239) / 240; // Max inserted value = 240
|
58
|
+
printf("Creating %d items:\n", nn1);
|
59
|
+
ss.Init();
|
60
|
+
for (int i = 0; i < nn1; ++i) {
|
61
|
+
ss.Add(i / divisor);
|
62
|
+
}
|
63
|
+
|
64
|
+
// Extract 12-item summary lengths
|
65
|
+
int n1 = 12;
|
66
|
+
ss.Extract(n1, dst);
|
67
|
+
printf("[%d] ", n1);
|
68
|
+
for (int i = 0; i < n1; ++i) {
|
69
|
+
printf("%d ", dst[i]);
|
70
|
+
}
|
71
|
+
printf("\n");
|
72
|
+
|
73
|
+
printf("\n");
|
74
|
+
printf("\n");
|
75
|
+
|
76
|
+
// Create 10**n element vector of patterns
|
77
|
+
int pow_10 = 1;
|
78
|
+
for (int nn = 0; nn < 9; ++nn) {
|
79
|
+
printf("Creating %d items:\n", pow_10);
|
80
|
+
int divisor = (pow_10 + 239) / 240; // Max inserted value = 240
|
81
|
+
ss.Init();
|
82
|
+
for (int i = 0; i < pow_10; ++i) {
|
83
|
+
ss.Add(i / divisor);
|
84
|
+
}
|
85
|
+
|
86
|
+
// Extract 12-item summary lengths
|
87
|
+
int n = 12;
|
88
|
+
ss.Extract(n, dst);
|
89
|
+
printf("[%d] ", n);
|
90
|
+
for (int i = 0; i < n; ++i) {
|
91
|
+
printf("%d ", dst[i]);
|
92
|
+
}
|
93
|
+
printf("\n");
|
94
|
+
|
95
|
+
pow_10 *= 10;
|
96
|
+
}
|
97
|
+
|
98
|
+
}
|
99
|
+
*/
|