language_detection 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/letterscript_enum.h"
|
|
6
|
+
|
|
7
|
+
#include "encodings/compact_lang_det/win/cld_logging.h"
|
|
8
|
+
|
|
9
|
+
// Full script names, indexed by UnicodeLScript value.
// The entries must stay in exactly the same order as the enumerators of
// UnicodeLScript; UnicodeLScriptName() indexes this table directly.
// Both the strings and the pointers are const (as for kLScriptName4), so the
// table cannot be modified at run time.
static const char* const kUnicodeLScriptNames[ULScript_NUM_SCRIPTS] = {
  "Common",
  "Latin",
  "Greek",
  "Cyrillic",
  "Armenian",
  "Hebrew",
  "Arabic",
  "Syriac",
  "Thaana",
  "Devanagari",
  "Bengali",
  "Gurmukhi",
  "Gujarati",
  "Oriya",
  "Tamil",
  "Telugu",
  "Kannada",
  "Malayalam",
  "Sinhala",
  "Thai",
  "Lao",
  "Tibetan",
  "Myanmar",
  "Georgian",
  "HanCJK",
  "Ethiopic",
  "Cherokee",
  "Canadian_Aboriginal",
  "Ogham",
  "Runic",
  "Khmer",
  "Mongolian",
  "Yi",
  "Old_Italic",
  "Gothic",
  "Deseret",
  "Inherited",
  "Tagalog",
  "Hanunoo",
  "Buhid",
  "Tagbanwa",
  "Limbu",
  "Tai_Le",
  "Linear_B",
  "Ugaritic",
  "Shavian",
  "Osmanya",
  "Cypriot",
  "Buginese",
  "Coptic",
  "New_Tai_Lue",
  "Glagolitic",
  "Tifinagh",
  "Syloti_Nagri",
  "Old_Persian",
  "Kharoshthi",
  "Balinese",
  "Cuneiform",
  "Phoenician",
  "Phags_Pa",
  "Nko",

  // Unicode 5.1 beta
  "Sundanese",
  "Lepcha",
  "Ol_Chiki",
  "Vai",
  "Saurashtra",
  "Kayah_Li",
  "Rejang",
  "Lycian",
  "Carian",
  "Lydian",
  "Cham",
};
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
// Unicode 5.1 beta script names from
|
|
88
|
+
// http://www.unicode.org/Public/5.1.0/diffs/5.0.0-5.1.0.all.2.diffs
|
|
89
|
+
// NOTE: 'Vai ' => "Vaii" to make four letters, not three
|
|
90
|
+
// see http://unicode.org/iso15924/iso15924-codes.html
|
|
91
|
+
const char* const kLScriptName4[ULScript_NUM_SCRIPTS] = {
|
|
92
|
+
"Zyyy", "Latn", "Grek", "Cyrl", "Armn", "Hebr", "Arab", "Syrc",
|
|
93
|
+
"Thaa", "Deva", "Beng", "Guru", "Gujr", "Orya", "Taml", "Telu",
|
|
94
|
+
"Knda", "Mlym", "Sinh", "Thai", "Laoo", "Tibt", "Mymr", "Geor",
|
|
95
|
+
"Hani", "Ethi", "Cher", "Cans", "Ogam", "Runr", "Khmr", "Mong",
|
|
96
|
+
|
|
97
|
+
"Yiii", "Ital", "Goth", "Dsrt", "Zzzz", "Tglg", "Hano", "Buhd",
|
|
98
|
+
"Tagb", "Limb", "Tale", "Linb", "Ugar", "Shaw", "Osma", "Cprt",
|
|
99
|
+
"Bugi", "Copt", "Talu", "Glag", "Tfng", "Sylo", "Xpeo", "Khar",
|
|
100
|
+
"Bali", "Xsux", "Phnx", "Phag", "Nkoo",
|
|
101
|
+
|
|
102
|
+
// Unicode 5.1 beta
|
|
103
|
+
"Sund", "Lepc", "Olck", "Vaii", "Saur", "Kali", "Rjng", "Lyci",
|
|
104
|
+
"Cari", "Lydi", "Cham",
|
|
105
|
+
};
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
const char* UnicodeLScriptName(const UnicodeLScript ls) {
|
|
109
|
+
CHECK(ls >= 0 && ls < ULScript_NUM_SCRIPTS);
|
|
110
|
+
return kUnicodeLScriptNames[ls];
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
const char* UnicodeLScriptCode(const UnicodeLScript ls) {
|
|
115
|
+
CHECK(ls >= 0 && ls < ULScript_NUM_SCRIPTS);
|
|
116
|
+
return kLScriptName4[ls];
|
|
117
|
+
}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
|
|
7
|
+
|
|
8
|
+
// Script identifiers. The numeric values are consecutive starting at 0 and
// are used as indexes into per-script tables, so enumerators must not be
// reordered. ULScript_NUM_SCRIPTS is the number of scripts, not a script.
enum UnicodeLScript {
  ULScript_Common = 0,           //  0
  ULScript_Latin,                //  1
  ULScript_Greek,                //  2
  ULScript_Cyrillic,             //  3
  ULScript_Armenian,             //  4
  ULScript_Hebrew,               //  5
  ULScript_Arabic,               //  6
  ULScript_Syriac,               //  7
  ULScript_Thaana,               //  8
  ULScript_Devanagari,           //  9
  ULScript_Bengali,              // 10
  ULScript_Gurmukhi,             // 11
  ULScript_Gujarati,             // 12
  ULScript_Oriya,                // 13
  ULScript_Tamil,                // 14
  ULScript_Telugu,               // 15
  ULScript_Kannada,              // 16
  ULScript_Malayalam,            // 17
  ULScript_Sinhala,              // 18
  ULScript_Thai,                 // 19
  ULScript_Lao,                  // 20
  ULScript_Tibetan,              // 21
  ULScript_Myanmar,              // 22
  ULScript_Georgian,             // 23
  ULScript_HanCJK,               // 24
  ULScript_Ethiopic,             // 25
  ULScript_Cherokee,             // 26
  ULScript_Canadian_Aboriginal,  // 27
  ULScript_Ogham,                // 28
  ULScript_Runic,                // 29
  ULScript_Khmer,                // 30
  ULScript_Mongolian,            // 31
  ULScript_Yi,                   // 32
  ULScript_Old_Italic,           // 33
  ULScript_Gothic,               // 34
  ULScript_Deseret,              // 35
  ULScript_Inherited,            // 36
  ULScript_Tagalog,              // 37
  ULScript_Hanunoo,              // 38
  ULScript_Buhid,                // 39
  ULScript_Tagbanwa,             // 40
  ULScript_Limbu,                // 41
  ULScript_Tai_Le,               // 42
  ULScript_Linear_B,             // 43
  ULScript_Ugaritic,             // 44
  ULScript_Shavian,              // 45
  ULScript_Osmanya,              // 46
  ULScript_Cypriot,              // 47
  ULScript_Buginese,             // 48
  ULScript_Coptic,               // 49
  ULScript_New_Tai_Lue,          // 50
  ULScript_Glagolitic,           // 51
  ULScript_Tifinagh,             // 52
  ULScript_Syloti_Nagri,         // 53
  ULScript_Old_Persian,          // 54
  ULScript_Kharoshthi,           // 55
  ULScript_Balinese,             // 56
  ULScript_Cuneiform,            // 57
  ULScript_Phoenician,           // 58
  ULScript_Phags_Pa,             // 59
  ULScript_Nko,                  // 60

  // Unicode 5.1
  ULScript_Sundanese,            // 61
  ULScript_Lepcha,               // 62
  ULScript_Ol_Chiki,             // 63
  ULScript_Vai,                  // 64
  ULScript_Saurashtra,           // 65
  ULScript_Kayah_Li,             // 66
  ULScript_Rejang,               // 67
  ULScript_Lycian,               // 68
  ULScript_Carian,               // 69
  ULScript_Lydian,               // 70
  ULScript_Cham,                 // 71
  ULScript_NUM_SCRIPTS           // 72: count of the scripts above
};
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
static const UnicodeLScript UNKNOWN_LSCRIPT = ULScript_Common;
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
// Return the name corresponding to the script ls, e.g. "Latin".
|
|
91
|
+
// It is a fatal error if ls is not a valid UnicodeLScript.
|
|
92
|
+
const char* UnicodeLScriptName(const UnicodeLScript ls);
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
// Return the 4-letter code corresponding to the script ls, e.g. "Latn".
|
|
96
|
+
// It is a fatal error if ls is not a valid UnicodeLScript.
|
|
97
|
+
const char* UnicodeLScriptCode(const UnicodeLScript ls);
|
|
98
|
+
|
|
99
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// Remember a subset of a sequence of values, using a modest amount of memory
|
|
6
|
+
|
|
7
|
+
/***
|
|
8
|
+
Design:
|
|
9
|
+
Accumulate in powers of three, using 3-way median to collapse entries.
|
|
10
|
+
At any given time, there is one most-dense (highest power of 3) range of
|
|
11
|
+
entries and a series of less-dense ranges that hold 0..2 entries each. There
|
|
12
|
+
is a bounded-size storage array of S cells for all the entries.
|
|
13
|
+
|
|
14
|
+
The overflow detect is set up so that a new higher power of 3, K+1, is
|
|
15
|
+
triggered precisely when range K has 3n entries and all ranges < K have
|
|
16
|
+
zero entries.
|
|
17
|
+
|
|
18
|
+
In general, think of the range sizes as a multi-digit base 3 number, except
|
|
19
|
+
the highest digit may exceed 2:
|
|
20
|
+
|
|
21
|
+
3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
|
|
22
|
+
0 0 0 0 3n-1 2 2 unused:1
|
|
23
|
+
|
|
24
|
+
There are a total of 3n-1 + 2 + 2 entries in use. Assume a size limit S at
|
|
25
|
+
one more than that, and we add a new 3**0 entry and "carry" by performing
|
|
26
|
+
medians on any group of 3 elements:
|
|
27
|
+
|
|
28
|
+
3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
|
|
29
|
+
0 0 0 0 3n-1 2 3 unused:0
|
|
30
|
+
0 0 0 0 3n-1 3 0 carry unused:2
|
|
31
|
+
0 0 0 0 3n 0 0 carry unused:4
|
|
32
|
+
|
|
33
|
+
To accumulate 2 entries at all levels < K and 3 just before the first carry at
|
|
34
|
+
level 0, we need 2*K + 1 unused cells after doing all carries, or five cells
|
|
35
|
+
in this case. Since we only have 4 cells in the example above, we need to
|
|
36
|
+
make room by starting a new power of three:
|
|
37
|
+
|
|
38
|
+
3**6 3**5 3**4 3**3 3**2 3**1 3**0
|
|
39
|
+
0 0 0 0 3n 0 0 K=2 unused:4
|
|
40
|
+
0 0 0 n 0 0 0 K=3 unused:2n+4
|
|
41
|
+
|
|
42
|
+
In the code below, we don't worry about overflow from the topmost place.
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
***/
|
|
46
|
+
|
|
47
|
+
#include "encodings/compact_lang_det/subsetsequence.h"
|
|
48
|
+
#include <stdio.h>
|
|
49
|
+
|
|
50
|
+
#include "encodings/compact_lang_det/win/cld_logging.h"
|
|
51
|
+
|
|
52
|
+
// Debug helper: print label, then the n ints starting at v (each followed by
// a space), then a newline, all to stdout. Prints only the label when n <= 0.
void DumpInts(const char* label, const int* v, int n) {
  printf("%s ", label);
  int idx = 0;
  while (idx < n) {
    printf("%d ", v[idx]);
    ++idx;
  }
  printf("\n");
}
|
|
59
|
+
|
|
60
|
+
// Debug helper: print label, then the n uint8 values starting at v as decimal
// numbers (each followed by a space), then a newline, all to stdout.
void DumpUint8s(const char* label, const uint8* v, int n) {
  printf("%s ", label);
  int idx = 0;
  while (idx < n) {
    printf("%d ", v[idx]);
    ++idx;
  }
  printf("\n");
}
|
|
67
|
+
|
|
68
|
+
// Return median of seq_[sub] .. seq_[sub+2], favoring middle element
|
|
69
|
+
uint8 SubsetSequence::Median3(int sub) {
|
|
70
|
+
if (seq_[sub] == seq_[sub + 1]) {
|
|
71
|
+
return seq_[sub];
|
|
72
|
+
}
|
|
73
|
+
if (seq_[sub] == seq_[sub + 2]) {
|
|
74
|
+
return seq_[sub];
|
|
75
|
+
}
|
|
76
|
+
return seq_[sub + 1];
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
// Reset to the empty state: no stored entries, current level 0.
void SubsetSequence::Init() {
  // printf("Init\n");

  next_e_ = 0;
  k_ = 0;
  count_[0] = 0;
  seq_[0] = 0;        // Default value if no calls to Add

  // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3:
  // keep 2 * k_ + 1 cells in reserve, then round down to a multiple of 3.
  const int usable = kMaxSeq_ - (2 * k_ + 1);
  level_limit_e_ = usable - (usable % 3);
  limit_e_ = level_limit_e_;
}
|
|
93
|
+
|
|
94
|
+
// Compress level k by 3x, creating level k+1
|
|
95
|
+
void SubsetSequence::NewLevel() {
|
|
96
|
+
// printf("NewLevel 3 ** %d\n", k_ + 1);
|
|
97
|
+
//DumpUint8s("count[k]", count_, k_ + 1);
|
|
98
|
+
//DumpUint8s("seq[next]", seq_, next_e_);
|
|
99
|
+
|
|
100
|
+
// Incoming level must be an exact multiple of three in size
|
|
101
|
+
CHECK((count_[k_] % 3) == 0);
|
|
102
|
+
int k_size = count_[k_];
|
|
103
|
+
int new_size = k_size / 3;
|
|
104
|
+
|
|
105
|
+
// Compress down by 3x, via median
|
|
106
|
+
for (int j = 0; j < new_size; ++j) {
|
|
107
|
+
seq_[j] = Median3(j * 3);
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
// Update counts
|
|
111
|
+
count_[k_] = 0;
|
|
112
|
+
// Else Overflow -- just continue with 3x dense Level K
|
|
113
|
+
if (k_ < (kMaxLevel_ - 1)) {++k_;}
|
|
114
|
+
count_[k_] = new_size;
|
|
115
|
+
|
|
116
|
+
// Update limits
|
|
117
|
+
next_e_ = new_size;
|
|
118
|
+
limit_e_ = next_e_ + 3;
|
|
119
|
+
|
|
120
|
+
// Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
|
|
121
|
+
int reserve = (2 * k_ + 1);
|
|
122
|
+
level_limit_e_ = kMaxSeq_ - reserve;
|
|
123
|
+
level_limit_e_ = (level_limit_e_ / 3) * 3; // Round down to multiple of 3
|
|
124
|
+
//
|
|
125
|
+
//DumpUint8s("after: count[k]", count_, k_ + 1);
|
|
126
|
+
//DumpUint8s("after: seq[next]", seq_, next_e_);
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
void SubsetSequence::DoCarries() {
|
|
130
|
+
CHECK(count_[k_] > 3); // We depend on count_[k_] being > 3 to stop while
|
|
131
|
+
// Make room by carrying
|
|
132
|
+
|
|
133
|
+
//DumpUint8s("DoCarries count[k]", count_, k_ + 1);
|
|
134
|
+
//DumpUint8s("DoCarries seq[next]", seq_, next_e_);
|
|
135
|
+
|
|
136
|
+
int i = 0;
|
|
137
|
+
while (count_[i] == 3) {
|
|
138
|
+
next_e_ -= 3;
|
|
139
|
+
seq_[next_e_] = Median3(next_e_);
|
|
140
|
+
++next_e_;
|
|
141
|
+
count_[i] = 0;
|
|
142
|
+
++count_[i + 1];
|
|
143
|
+
++i;
|
|
144
|
+
}
|
|
145
|
+
limit_e_ = next_e_ + 3;
|
|
146
|
+
|
|
147
|
+
//DumpUint8s("after: DoCarries count[k]", count_, k_ + 1);
|
|
148
|
+
//DumpUint8s("after: DoCarries seq[next]", seq_, next_e_);
|
|
149
|
+
|
|
150
|
+
// If we just fully carried into level K,
|
|
151
|
+
// Make sure there is now enough room, else start level K + 1
|
|
152
|
+
if (i >= k_) {
|
|
153
|
+
CHECK(count_[k_] == next_e_);
|
|
154
|
+
if (next_e_ >= level_limit_e_) {
|
|
155
|
+
NewLevel();
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
// Append one value as a level-0 entry, then carry if the current limit has
// been reached.
void SubsetSequence::Add(uint8 e) {
  seq_[next_e_++] = e;
  ++count_[0];

  if (next_e_ < limit_e_) {
    return;     // Still room before the next carry
  }
  DoCarries();
}
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
// Collapse tail end by simple median across disparate-weight values,
|
|
173
|
+
// dropping or duplicating last value if need be.
|
|
174
|
+
// This routine is idempotent.
|
|
175
|
+
void SubsetSequence::Flush() {
|
|
176
|
+
// printf("Flush %d\n", count_[k_]);
|
|
177
|
+
int start_tail = count_[k_];
|
|
178
|
+
int size_tail = next_e_ - start_tail;
|
|
179
|
+
if ((size_tail % 3) == 2) {
|
|
180
|
+
seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value
|
|
181
|
+
++size_tail;
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
// Compress tail down by 3x, via median
|
|
185
|
+
int new_size = size_tail / 3; // May delete last value
|
|
186
|
+
for (int j = 0; j < new_size; ++j) {
|
|
187
|
+
seq_[start_tail + j] = Median3(start_tail + j * 3);
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
next_e_ = start_tail + new_size;
|
|
191
|
+
count_[k_] = next_e_;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
// Extract representative pattern of exactly N values into dst[0..n-1]
|
|
196
|
+
// This routine may be called multiple times, but it may downsample as a
|
|
197
|
+
// side effect, causing subsequent calls with larger N to get poor answers.
|
|
198
|
+
void SubsetSequence::Extract(int to_n, uint8* dst) {
|
|
199
|
+
// Collapse partial-carries in tail
|
|
200
|
+
Flush();
|
|
201
|
+
|
|
202
|
+
// Just use Bresenham to resample
|
|
203
|
+
int from_n = next_e_;
|
|
204
|
+
if (to_n >= from_n) {
|
|
205
|
+
// Up-sample from_n => to_n
|
|
206
|
+
int err = to_n - 1; // bias toward no overshoot
|
|
207
|
+
int j = 0;
|
|
208
|
+
for (int i = 0; i < to_n; ++i) {
|
|
209
|
+
dst[i] = seq_[j];
|
|
210
|
+
err -= from_n;
|
|
211
|
+
if (err < 0) {
|
|
212
|
+
++j;
|
|
213
|
+
err += to_n;
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
} else {
|
|
217
|
+
// Get to the point that the number of samples is <= 3 * to_n
|
|
218
|
+
while (next_e_ > (to_n * 3)) {
|
|
219
|
+
// Compress down by 3x, via median
|
|
220
|
+
// printf("Extract, median %d / 3\n", next_e_);
|
|
221
|
+
if ((next_e_ % 3) == 2) {
|
|
222
|
+
seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value
|
|
223
|
+
++next_e_;
|
|
224
|
+
}
|
|
225
|
+
int new_size = next_e_ / 3; // May delete last value
|
|
226
|
+
for (int j = 0; j < new_size; ++j) {
|
|
227
|
+
seq_[j] = Median3(j * 3);
|
|
228
|
+
}
|
|
229
|
+
next_e_ = new_size;
|
|
230
|
+
count_[k_] = next_e_;
|
|
231
|
+
}
|
|
232
|
+
from_n = next_e_;
|
|
233
|
+
|
|
234
|
+
if (to_n == from_n) {
|
|
235
|
+
// Copy verbatim
|
|
236
|
+
for (int i = 0; i < to_n; ++i) {
|
|
237
|
+
dst[i] = seq_[i];
|
|
238
|
+
}
|
|
239
|
+
return;
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
// Down-sample from_n => to_n, using medians
|
|
243
|
+
int err = 0; // Bias to immediate median sample
|
|
244
|
+
int j = 0;
|
|
245
|
+
for (int i = 0; i < from_n; ++i) {
|
|
246
|
+
err -= to_n;
|
|
247
|
+
if (err < 0) {
|
|
248
|
+
if (i <= (next_e_ - 2)) {
|
|
249
|
+
dst[j] = Median3(i);
|
|
250
|
+
} else {
|
|
251
|
+
dst[j] = seq_[i];
|
|
252
|
+
}
|
|
253
|
+
++j;
|
|
254
|
+
err += from_n;
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// Remember a subset of a sequence of values, using a modest amount of memory
|
|
6
|
+
|
|
7
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
|
|
8
|
+
#define ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
|
|
9
|
+
|
|
10
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
|
11
|
+
#include "encodings/compact_lang_det/win/cld_google.h"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SubsetSequence {
|
|
15
|
+
public:
|
|
16
|
+
void Init();
|
|
17
|
+
void Add(uint8 e);
|
|
18
|
+
void Extract(int n, uint8* dst);
|
|
19
|
+
SubsetSequence() {Init();}
|
|
20
|
+
~SubsetSequence() {};
|
|
21
|
+
|
|
22
|
+
private:
|
|
23
|
+
uint8 Median3(int sub);
|
|
24
|
+
void NewLevel();
|
|
25
|
+
void DoCarries();
|
|
26
|
+
void Flush();
|
|
27
|
+
|
|
28
|
+
static const int kMaxLevel_ = 16; // 3**16 ~= 43M (3**20 ~= 3.4B)
|
|
29
|
+
static const int kMaxSeq_ = 128;
|
|
30
|
+
|
|
31
|
+
int k_;
|
|
32
|
+
int next_e_;
|
|
33
|
+
int limit_e_;
|
|
34
|
+
int level_limit_e_;
|
|
35
|
+
uint8 seq_[kMaxSeq_];
|
|
36
|
+
uint8 count_[kMaxLevel_ + 1]; // +1 allows graceful overflow
|
|
37
|
+
|
|
38
|
+
DISALLOW_EVIL_CONSTRUCTORS(SubsetSequence);
|
|
39
|
+
|
|
40
|
+
// Require enough room to end up with 40 entries plus carrying space
|
|
41
|
+
COMPILE_ASSERT(kMaxSeq_ >= (kMaxLevel_ * 2 + 40), kMaxSeq__is_too_small);
|
|
42
|
+
};
|
|
43
|
+
|
|
44
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_SUBSETSEQUENCE_H_
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
// Copyright 2008 Google Inc. All Rights Reserved.
|
|
2
|
+
// Author: dsites@google.com (Dick Sites)
|
|
3
|
+
/*
|
|
4
|
+
#include "testing/base/public/gunit.h"
|
|
5
|
+
#include "testing/lib/strings/overrun_sensitive_memory_block.h"
|
|
6
|
+
#include "cld/encodings/compact_lang_det/subsetsequence.h"
|
|
7
|
+
|
|
8
|
+
// This always passes. It is just scaffolding to exercise the subsequence
|
|
9
|
+
// facility, which is likely to get abandoned soon. dsites 2008.11.17
|
|
10
|
+
//
|
|
11
|
+
TEST(SubsetSequence, foo) {
|
|
12
|
+
uint8 dst[120];
|
|
13
|
+
|
|
14
|
+
// Create 120-element vector
|
|
15
|
+
printf("Creating %d items:\n", 120);
|
|
16
|
+
SubsetSequence ss;
|
|
17
|
+
for (int i = 0; i < 120; ++i) {
|
|
18
|
+
ss.Add(i);
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
// Extract various lengths
|
|
22
|
+
for (int n = 120; n >= 0; --n) {
|
|
23
|
+
ss.Extract(n, dst);
|
|
24
|
+
printf("[%d] ", n);
|
|
25
|
+
for (int i = 0; i < n; ++i) {
|
|
26
|
+
printf("%d ", dst[i]);
|
|
27
|
+
}
|
|
28
|
+
printf("\n");
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
printf("\n");
|
|
32
|
+
printf("\n");
|
|
33
|
+
|
|
34
|
+
// Create 120-element vector of 7 items each
|
|
35
|
+
printf("Creating %d items:\n", 120);
|
|
36
|
+
ss.Init();
|
|
37
|
+
for (int i = 0; i < 120; ++i) {
|
|
38
|
+
ss.Add(i / 7);
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// Extract various lengths
|
|
42
|
+
for (int n = 120; n >= 0; --n) {
|
|
43
|
+
ss.Extract(n, dst);
|
|
44
|
+
printf("[%d] ", n);
|
|
45
|
+
for (int i = 0; i < n; ++i) {
|
|
46
|
+
printf("%d ", dst[i]);
|
|
47
|
+
}
|
|
48
|
+
printf("\n");
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
printf("\n");
|
|
52
|
+
printf("\n");
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
// Create 400 element vector of patterns
|
|
56
|
+
int nn1 = 400;
|
|
57
|
+
int divisor = (nn1 + 239) / 240; // Max inserted value = 240
|
|
58
|
+
printf("Creating %d items:\n", nn1);
|
|
59
|
+
ss.Init();
|
|
60
|
+
for (int i = 0; i < nn1; ++i) {
|
|
61
|
+
ss.Add(i / divisor);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
// Extract 12-item summary lengths
|
|
65
|
+
int n1 = 12;
|
|
66
|
+
ss.Extract(n1, dst);
|
|
67
|
+
printf("[%d] ", n1);
|
|
68
|
+
for (int i = 0; i < n1; ++i) {
|
|
69
|
+
printf("%d ", dst[i]);
|
|
70
|
+
}
|
|
71
|
+
printf("\n");
|
|
72
|
+
|
|
73
|
+
printf("\n");
|
|
74
|
+
printf("\n");
|
|
75
|
+
|
|
76
|
+
// Create 10**n element vector of patterns
|
|
77
|
+
int pow_10 = 1;
|
|
78
|
+
for (int nn = 0; nn < 9; ++nn) {
|
|
79
|
+
printf("Creating %d items:\n", pow_10);
|
|
80
|
+
int divisor = (pow_10 + 239) / 240; // Max inserted value = 240
|
|
81
|
+
ss.Init();
|
|
82
|
+
for (int i = 0; i < pow_10; ++i) {
|
|
83
|
+
ss.Add(i / divisor);
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
// Extract 12-item summary lengths
|
|
87
|
+
int n = 12;
|
|
88
|
+
ss.Extract(n, dst);
|
|
89
|
+
printf("[%d] ", n);
|
|
90
|
+
for (int i = 0; i < n; ++i) {
|
|
91
|
+
printf("%d ", dst[i]);
|
|
92
|
+
}
|
|
93
|
+
printf("\n");
|
|
94
|
+
|
|
95
|
+
pow_10 *= 10;
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
}
|
|
99
|
+
*/
|