cld 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +27 -0
- data/Manifest +106 -0
- data/README.rdoc +173 -0
- data/Rakefile +15 -0
- data/base/basictypes.h +348 -0
- data/base/build_config.h +115 -0
- data/base/casts.h +156 -0
- data/base/commandlineflags.h +443 -0
- data/base/crash.h +41 -0
- data/base/dynamic_annotations.h +358 -0
- data/base/global_strip_options.h +59 -0
- data/base/log_severity.h +46 -0
- data/base/logging.h +1403 -0
- data/base/macros.h +243 -0
- data/base/port.h +54 -0
- data/base/scoped_ptr.h +428 -0
- data/base/stl_decl.h +0 -0
- data/base/stl_decl_msvc.h +107 -0
- data/base/string_util.h +29 -0
- data/base/strtoint.h +93 -0
- data/base/template_util.h +96 -0
- data/base/type_traits.h +198 -0
- data/base/vlog_is_on.h +143 -0
- data/build.sh +48 -0
- data/build.win.cmd +28 -0
- data/cld.gemspec +30 -0
- data/cld_encodings.h +95 -0
- data/encodings/compact_lang_det/#cldutil.cc# +905 -0
- data/encodings/compact_lang_det/#cldutil.h# +1205 -0
- data/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
- data/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
- data/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
- data/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
- data/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
- data/encodings/compact_lang_det/#tote.cc# +299 -0
- data/encodings/compact_lang_det/#tote.h# +89 -0
- data/encodings/compact_lang_det/cldutil.cc +905 -0
- data/encodings/compact_lang_det/cldutil.h +1205 -0
- data/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/encodings/compact_lang_det/compile.cmd +1 -0
- data/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/encodings/compact_lang_det/tote.cc +299 -0
- data/encodings/compact_lang_det/tote.h +89 -0
- data/encodings/compact_lang_det/unittest_data.h +193 -0
- data/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
- data/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/encodings/internal/encodings.cc +12 -0
- data/encodings/lang_enc.h +254 -0
- data/encodings/proto/encodings.pb.h +169 -0
- data/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +7 -0
- data/languages/internal/#languages.cc# +337 -0
- data/languages/internal/languages.cc +337 -0
- data/languages/proto/languages.pb.h +179 -0
- data/languages/public/languages.h +379 -0
- data/lib/cld.rb +12 -0
- data/test/test.rb +570 -0
- data/thunk.cc +131 -0
- metadata +168 -0
data/LICENSE
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
|
|
2
|
+
//
|
|
3
|
+
// Redistribution and use in source and binary forms, with or without
|
|
4
|
+
// modification, are permitted provided that the following conditions are
|
|
5
|
+
// met:
|
|
6
|
+
//
|
|
7
|
+
// * Redistributions of source code must retain the above copyright
|
|
8
|
+
// notice, this list of conditions and the following disclaimer.
|
|
9
|
+
// * Redistributions in binary form must reproduce the above
|
|
10
|
+
// copyright notice, this list of conditions and the following disclaimer
|
|
11
|
+
// in the documentation and/or other materials provided with the
|
|
12
|
+
// distribution.
|
|
13
|
+
// * Neither the name of Google Inc. nor the names of its
|
|
14
|
+
// contributors may be used to endorse or promote products derived from
|
|
15
|
+
// this software without specific prior written permission.
|
|
16
|
+
//
|
|
17
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
18
|
+
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
19
|
+
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
20
|
+
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
21
|
+
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
22
|
+
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
23
|
+
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
24
|
+
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
25
|
+
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
26
|
+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
27
|
+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/Manifest
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.rdoc
|
|
3
|
+
Rakefile
|
|
4
|
+
base/basictypes.h
|
|
5
|
+
base/build_config.h
|
|
6
|
+
base/casts.h
|
|
7
|
+
base/commandlineflags.h
|
|
8
|
+
base/crash.h
|
|
9
|
+
base/dynamic_annotations.h
|
|
10
|
+
base/global_strip_options.h
|
|
11
|
+
base/log_severity.h
|
|
12
|
+
base/logging.h
|
|
13
|
+
base/macros.h
|
|
14
|
+
base/port.h
|
|
15
|
+
base/scoped_ptr.h
|
|
16
|
+
base/stl_decl.h
|
|
17
|
+
base/stl_decl_msvc.h
|
|
18
|
+
base/string_util.h
|
|
19
|
+
base/strtoint.h
|
|
20
|
+
base/template_util.h
|
|
21
|
+
base/type_traits.h
|
|
22
|
+
base/vlog_is_on.h
|
|
23
|
+
build.sh
|
|
24
|
+
build.win.cmd
|
|
25
|
+
cld.gemspec
|
|
26
|
+
cld_encodings.h
|
|
27
|
+
encodings/compact_lang_det/#cldutil.cc#
|
|
28
|
+
encodings/compact_lang_det/#cldutil.h#
|
|
29
|
+
encodings/compact_lang_det/#compact_lang_det_impl.h#
|
|
30
|
+
encodings/compact_lang_det/#ext_lang_enc.cc#
|
|
31
|
+
encodings/compact_lang_det/#ext_lang_enc.h#
|
|
32
|
+
encodings/compact_lang_det/#getonescriptspan.cc#
|
|
33
|
+
encodings/compact_lang_det/#getonescriptspan.h#
|
|
34
|
+
encodings/compact_lang_det/#tote.cc#
|
|
35
|
+
encodings/compact_lang_det/#tote.h#
|
|
36
|
+
encodings/compact_lang_det/cldutil.cc
|
|
37
|
+
encodings/compact_lang_det/cldutil.h
|
|
38
|
+
encodings/compact_lang_det/cldutil_dbg.h
|
|
39
|
+
encodings/compact_lang_det/cldutil_dbg_empty.cc
|
|
40
|
+
encodings/compact_lang_det/compact_lang_det.cc
|
|
41
|
+
encodings/compact_lang_det/compact_lang_det.h
|
|
42
|
+
encodings/compact_lang_det/compact_lang_det_impl.cc
|
|
43
|
+
encodings/compact_lang_det/compact_lang_det_impl.h
|
|
44
|
+
encodings/compact_lang_det/compact_lang_det_unittest_small.cc
|
|
45
|
+
encodings/compact_lang_det/compile.cmd
|
|
46
|
+
encodings/compact_lang_det/ext_lang_enc.cc
|
|
47
|
+
encodings/compact_lang_det/ext_lang_enc.h
|
|
48
|
+
encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc
|
|
49
|
+
encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc
|
|
50
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc
|
|
51
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc
|
|
52
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc
|
|
53
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc
|
|
54
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc
|
|
55
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h
|
|
56
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc
|
|
57
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc
|
|
58
|
+
encodings/compact_lang_det/getonescriptspan.cc
|
|
59
|
+
encodings/compact_lang_det/getonescriptspan.h
|
|
60
|
+
encodings/compact_lang_det/letterscript_enum.cc
|
|
61
|
+
encodings/compact_lang_det/letterscript_enum.h
|
|
62
|
+
encodings/compact_lang_det/subsetsequence.cc
|
|
63
|
+
encodings/compact_lang_det/subsetsequence.h
|
|
64
|
+
encodings/compact_lang_det/subsetsequence_unittest.cc
|
|
65
|
+
encodings/compact_lang_det/tote.cc
|
|
66
|
+
encodings/compact_lang_det/tote.h
|
|
67
|
+
encodings/compact_lang_det/unittest_data.h
|
|
68
|
+
encodings/compact_lang_det/utf8propjustletter.h
|
|
69
|
+
encodings/compact_lang_det/utf8propletterscriptnum.h
|
|
70
|
+
encodings/compact_lang_det/utf8scannotjustletterspecial.h
|
|
71
|
+
encodings/compact_lang_det/win/#cld_unilib_windows.cc#
|
|
72
|
+
encodings/compact_lang_det/win/cld_basictypes.h
|
|
73
|
+
encodings/compact_lang_det/win/cld_commandlineflags.h
|
|
74
|
+
encodings/compact_lang_det/win/cld_google.h
|
|
75
|
+
encodings/compact_lang_det/win/cld_htmlutils.h
|
|
76
|
+
encodings/compact_lang_det/win/cld_htmlutils_google3.cc
|
|
77
|
+
encodings/compact_lang_det/win/cld_htmlutils_windows.cc
|
|
78
|
+
encodings/compact_lang_det/win/cld_logging.h
|
|
79
|
+
encodings/compact_lang_det/win/cld_macros.h
|
|
80
|
+
encodings/compact_lang_det/win/cld_strtoint.h
|
|
81
|
+
encodings/compact_lang_det/win/cld_unicodetext.cc
|
|
82
|
+
encodings/compact_lang_det/win/cld_unicodetext.h
|
|
83
|
+
encodings/compact_lang_det/win/cld_unilib.h
|
|
84
|
+
encodings/compact_lang_det/win/cld_unilib_google3.cc
|
|
85
|
+
encodings/compact_lang_det/win/cld_unilib_windows.cc
|
|
86
|
+
encodings/compact_lang_det/win/cld_utf.h
|
|
87
|
+
encodings/compact_lang_det/win/cld_utf8statetable.cc
|
|
88
|
+
encodings/compact_lang_det/win/cld_utf8statetable.h
|
|
89
|
+
encodings/compact_lang_det/win/cld_utf8utils.h
|
|
90
|
+
encodings/compact_lang_det/win/cld_utf8utils_google3.cc
|
|
91
|
+
encodings/compact_lang_det/win/cld_utf8utils_windows.cc
|
|
92
|
+
encodings/compact_lang_det/win/normalizedunicodetext.cc
|
|
93
|
+
encodings/compact_lang_det/win/normalizedunicodetext.h
|
|
94
|
+
encodings/internal/encodings.cc
|
|
95
|
+
encodings/lang_enc.h
|
|
96
|
+
encodings/proto/encodings.pb.h
|
|
97
|
+
encodings/public/encodings.h
|
|
98
|
+
ext/cld/extconf.rb
|
|
99
|
+
languages/internal/#languages.cc#
|
|
100
|
+
languages/internal/languages.cc
|
|
101
|
+
languages/proto/languages.pb.h
|
|
102
|
+
languages/public/languages.h
|
|
103
|
+
lib/cld.rb
|
|
104
|
+
test/test.rb
|
|
105
|
+
thunk.cc
|
|
106
|
+
Manifest
|
data/README.rdoc
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
This is a wrapper of the Compact Language Detection library from Chrome.
|
|
2
|
+
To use:
|
|
3
|
+
require "cld"
|
|
4
|
+
language = CLD.detect_language("piece of text")
|
|
5
|
+
is_english = CLD.english?("我不是英文")
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
detect_language returns a unique integer representing each language. Here are the languages:
|
|
9
|
+
ENGLISH = 0,
|
|
10
|
+
DANISH = 1,
|
|
11
|
+
DUTCH = 2,
|
|
12
|
+
FINNISH = 3,
|
|
13
|
+
FRENCH = 4,
|
|
14
|
+
GERMAN = 5,
|
|
15
|
+
HEBREW = 6,
|
|
16
|
+
ITALIAN = 7,
|
|
17
|
+
JAPANESE = 8,
|
|
18
|
+
KOREAN = 9,
|
|
19
|
+
NORWEGIAN = 10,
|
|
20
|
+
POLISH = 11,
|
|
21
|
+
PORTUGUESE = 12,
|
|
22
|
+
RUSSIAN = 13,
|
|
23
|
+
SPANISH = 14,
|
|
24
|
+
SWEDISH = 15,
|
|
25
|
+
CHINESE = 16,
|
|
26
|
+
CZECH = 17,
|
|
27
|
+
GREEK = 18,
|
|
28
|
+
ICELANDIC = 19,
|
|
29
|
+
LATVIAN = 20,
|
|
30
|
+
LITHUANIAN = 21,
|
|
31
|
+
ROMANIAN = 22,
|
|
32
|
+
HUNGARIAN = 23,
|
|
33
|
+
ESTONIAN = 24,
|
|
34
|
+
TG_UNKNOWN_LANGUAGE = 25,
|
|
35
|
+
UNKNOWN_LANGUAGE = 26,
|
|
36
|
+
BULGARIAN = 27,
|
|
37
|
+
CROATIAN = 28,
|
|
38
|
+
SERBIAN = 29,
|
|
39
|
+
IRISH = 30,
|
|
40
|
+
GALICIAN = 31,
|
|
41
|
+
TAGALOG = 32,
|
|
42
|
+
TURKISH = 33,
|
|
43
|
+
UKRAINIAN = 34,
|
|
44
|
+
HINDI = 35,
|
|
45
|
+
MACEDONIAN = 36,
|
|
46
|
+
BENGALI = 37,
|
|
47
|
+
INDONESIAN = 38,
|
|
48
|
+
LATIN = 39,
|
|
49
|
+
MALAY = 40,
|
|
50
|
+
MALAYALAM = 41,
|
|
51
|
+
WELSH = 42,
|
|
52
|
+
NEPALI = 43,
|
|
53
|
+
TELUGU = 44,
|
|
54
|
+
ALBANIAN = 45,
|
|
55
|
+
TAMIL = 46,
|
|
56
|
+
BELARUSIAN = 47,
|
|
57
|
+
JAVANESE = 48,
|
|
58
|
+
OCCITAN = 49,
|
|
59
|
+
URDU = 50,
|
|
60
|
+
BIHARI = 51,
|
|
61
|
+
GUJARATI = 52,
|
|
62
|
+
THAI = 53,
|
|
63
|
+
ARABIC = 54,
|
|
64
|
+
CATALAN = 55,
|
|
65
|
+
ESPERANTO = 56,
|
|
66
|
+
BASQUE = 57,
|
|
67
|
+
INTERLINGUA = 58,
|
|
68
|
+
KANNADA = 59,
|
|
69
|
+
PUNJABI = 60,
|
|
70
|
+
SCOTS_GAELIC = 61,
|
|
71
|
+
SWAHILI = 62,
|
|
72
|
+
SLOVENIAN = 63,
|
|
73
|
+
MARATHI = 64,
|
|
74
|
+
MALTESE = 65,
|
|
75
|
+
VIETNAMESE = 66,
|
|
76
|
+
FRISIAN = 67,
|
|
77
|
+
SLOVAK = 68,
|
|
78
|
+
CHINESE_T = 69,
|
|
79
|
+
FAROESE = 70,
|
|
80
|
+
SUNDANESE = 71,
|
|
81
|
+
UZBEK = 72,
|
|
82
|
+
AMHARIC = 73,
|
|
83
|
+
AZERBAIJANI = 74,
|
|
84
|
+
GEORGIAN = 75,
|
|
85
|
+
TIGRINYA = 76,
|
|
86
|
+
PERSIAN = 77,
|
|
87
|
+
BOSNIAN = 78,
|
|
88
|
+
SINHALESE = 79,
|
|
89
|
+
NORWEGIAN_N = 80,
|
|
90
|
+
PORTUGUESE_P = 81,
|
|
91
|
+
PORTUGUESE_B = 82,
|
|
92
|
+
XHOSA = 83,
|
|
93
|
+
ZULU = 84,
|
|
94
|
+
GUARANI = 85,
|
|
95
|
+
SESOTHO = 86,
|
|
96
|
+
TURKMEN = 87,
|
|
97
|
+
KYRGYZ = 88,
|
|
98
|
+
BRETON = 89,
|
|
99
|
+
TWI = 90,
|
|
100
|
+
YIDDISH = 91,
|
|
101
|
+
SERBO_CROATIAN= 92,
|
|
102
|
+
SOMALI = 93,
|
|
103
|
+
UIGHUR = 94,
|
|
104
|
+
KURDISH = 95,
|
|
105
|
+
MONGOLIAN = 96,
|
|
106
|
+
ARMENIAN = 97,
|
|
107
|
+
LAOTHIAN = 98,
|
|
108
|
+
SINDHI = 99,
|
|
109
|
+
RHAETO_ROMANCE= 100,
|
|
110
|
+
AFRIKAANS = 101,
|
|
111
|
+
LUXEMBOURGISH = 102,
|
|
112
|
+
BURMESE = 103,
|
|
113
|
+
KHMER = 104,
|
|
114
|
+
TIBETAN = 105,
|
|
115
|
+
DHIVEHI = 106,
|
|
116
|
+
CHEROKEE = 107,
|
|
117
|
+
SYRIAC = 108,
|
|
118
|
+
LIMBU = 109,
|
|
119
|
+
ORIYA = 110,
|
|
120
|
+
ASSAMESE = 111,
|
|
121
|
+
CORSICAN = 112,
|
|
122
|
+
INTERLINGUE = 113,
|
|
123
|
+
KAZAKH = 114,
|
|
124
|
+
LINGALA = 115,
|
|
125
|
+
MOLDAVIAN = 116,
|
|
126
|
+
PASHTO = 117,
|
|
127
|
+
QUECHUA = 118,
|
|
128
|
+
SHONA = 119,
|
|
129
|
+
TAJIK = 120,
|
|
130
|
+
TATAR = 121,
|
|
131
|
+
TONGA = 122,
|
|
132
|
+
YORUBA = 123,
|
|
133
|
+
CREOLES_AND_PIDGINS_ENGLISH_BASED = 124,
|
|
134
|
+
CREOLES_AND_PIDGINS_FRENCH_BASED = 125,
|
|
135
|
+
CREOLES_AND_PIDGINS_PORTUGUESE_BASED = 126,
|
|
136
|
+
CREOLES_AND_PIDGINS_OTHER = 127,
|
|
137
|
+
MAORI = 128,
|
|
138
|
+
WOLOF = 129,
|
|
139
|
+
ABKHAZIAN = 130,
|
|
140
|
+
AFAR = 131,
|
|
141
|
+
AYMARA = 132,
|
|
142
|
+
BASHKIR = 133,
|
|
143
|
+
BISLAMA = 134,
|
|
144
|
+
DZONGKHA = 135,
|
|
145
|
+
FIJIAN = 136,
|
|
146
|
+
GREENLANDIC = 137,
|
|
147
|
+
HAUSA = 138,
|
|
148
|
+
HAITIAN_CREOLE= 139,
|
|
149
|
+
INUPIAK = 140,
|
|
150
|
+
INUKTITUT = 141,
|
|
151
|
+
KASHMIRI = 142,
|
|
152
|
+
KINYARWANDA = 143,
|
|
153
|
+
MALAGASY = 144,
|
|
154
|
+
NAURU = 145,
|
|
155
|
+
OROMO = 146,
|
|
156
|
+
RUNDI = 147,
|
|
157
|
+
SAMOAN = 148,
|
|
158
|
+
SANGO = 149,
|
|
159
|
+
SANSKRIT = 150,
|
|
160
|
+
SISWANT = 151,
|
|
161
|
+
TSONGA = 152,
|
|
162
|
+
TSWANA = 153,
|
|
163
|
+
VOLAPUK = 154,
|
|
164
|
+
ZHUANG = 155,
|
|
165
|
+
KHASI = 156,
|
|
166
|
+
SCOTS = 157,
|
|
167
|
+
GANDA = 158,
|
|
168
|
+
MANX = 159,
|
|
169
|
+
MONTENEGRIN = 160,
|
|
170
|
+
NUM_LANGUAGES = 161,
|
|
171
|
+
|
|
172
|
+
Thanks to Mike McCandless for finding this code and writing a Python version.
|
|
173
|
+
Thanks to the Chrome Authors.
|
data/Rakefile
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
|
|
2
|
+
require 'rubygems'
|
|
3
|
+
require 'rake'
|
|
4
|
+
require 'echoe'
|
|
5
|
+
|
|
6
|
+
Echoe.new('cld', '0.1.0') do |p|
|
|
7
|
+
p.description = "Compact Language Detection from chrome"
|
|
8
|
+
p.url = "http://github.com/jtoy/cld"
|
|
9
|
+
p.author = "Jason Toy"
|
|
10
|
+
p.email = "jtoy@jtoy.net"
|
|
11
|
+
p.ignore_pattern = ["tmp/*", "script/*"]
|
|
12
|
+
p.development_dependencies = []
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
#Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each { |ext| load ext }
|
data/base/basictypes.h
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef BASE_BASICTYPES_H_
|
|
6
|
+
#define BASE_BASICTYPES_H_
|
|
7
|
+
|
|
8
|
+
#include <limits.h> // So we can set the bounds of our types
|
|
9
|
+
#include <stddef.h> // For size_t
|
|
10
|
+
#include <string.h> // for memcpy
|
|
11
|
+
|
|
12
|
+
#include "base/port.h" // Types that only need exist on certain systems
|
|
13
|
+
|
|
14
|
+
#ifndef COMPILER_MSVC
|
|
15
|
+
// stdint.h is part of C99 but MSVC doesn't have it.
|
|
16
|
+
#include <stdint.h> // For intptr_t.
|
|
17
|
+
#endif
|
|
18
|
+
|
|
19
|
+
typedef signed char schar;
|
|
20
|
+
typedef signed char int8;
|
|
21
|
+
typedef short int16;
|
|
22
|
+
// TODO(mbelshe) Remove these type guards. These are
|
|
23
|
+
// temporary to avoid conflicts with npapi.h.
|
|
24
|
+
#ifndef _INT32
|
|
25
|
+
#define _INT32
|
|
26
|
+
typedef int int32;
|
|
27
|
+
#endif
|
|
28
|
+
|
|
29
|
+
// The NSPR system headers define 64-bit as |long| when possible. In order to
|
|
30
|
+
// not have typedef mismatches, we do the same on LP64.
|
|
31
|
+
#if __LP64__
|
|
32
|
+
typedef long int64;
|
|
33
|
+
#else
|
|
34
|
+
typedef long long int64;
|
|
35
|
+
#endif
|
|
36
|
+
|
|
37
|
+
// NOTE: unsigned types are DANGEROUS in loops and other arithmetical
|
|
38
|
+
// places. Use the signed types unless your variable represents a bit
|
|
39
|
+
// pattern (eg a hash value) or you really need the extra bit. Do NOT
|
|
40
|
+
// use 'unsigned' to express "this value should always be positive";
|
|
41
|
+
// use assertions for this.
|
|
42
|
+
|
|
43
|
+
typedef unsigned char uint8;
|
|
44
|
+
typedef unsigned short uint16;
|
|
45
|
+
// TODO(mbelshe) Remove these type guards. These are
|
|
46
|
+
// temporary to avoid conflicts with npapi.h.
|
|
47
|
+
#ifndef _UINT32
|
|
48
|
+
#define _UINT32
|
|
49
|
+
typedef unsigned int uint32;
|
|
50
|
+
#endif
|
|
51
|
+
|
|
52
|
+
// See the comment above about NSPR and 64-bit.
|
|
53
|
+
#if __LP64__
|
|
54
|
+
typedef unsigned long uint64;
|
|
55
|
+
#else
|
|
56
|
+
typedef unsigned long long uint64;
|
|
57
|
+
#endif
|
|
58
|
+
|
|
59
|
+
// A type to represent a Unicode code-point value. As of Unicode 4.0,
|
|
60
|
+
// such values require up to 21 bits.
|
|
61
|
+
// (For type-checking on pointers, make this explicitly signed,
|
|
62
|
+
// and it should always be the signed version of whatever int32 is.)
|
|
63
|
+
typedef signed int char32;
|
|
64
|
+
|
|
65
|
+
const uint8 kuint8max = (( uint8) 0xFF);
|
|
66
|
+
const uint16 kuint16max = ((uint16) 0xFFFF);
|
|
67
|
+
const uint32 kuint32max = ((uint32) 0xFFFFFFFF);
|
|
68
|
+
const uint64 kuint64max = ((uint64) GG_LONGLONG(0xFFFFFFFFFFFFFFFF));
|
|
69
|
+
const int8 kint8min = (( int8) 0x80);
|
|
70
|
+
const int8 kint8max = (( int8) 0x7F);
|
|
71
|
+
const int16 kint16min = (( int16) 0x8000);
|
|
72
|
+
const int16 kint16max = (( int16) 0x7FFF);
|
|
73
|
+
const int32 kint32min = (( int32) 0x80000000);
|
|
74
|
+
const int32 kint32max = (( int32) 0x7FFFFFFF);
|
|
75
|
+
const int64 kint64min = (( int64) GG_LONGLONG(0x8000000000000000));
|
|
76
|
+
const int64 kint64max = (( int64) GG_LONGLONG(0x7FFFFFFFFFFFFFFF));
|
|
77
|
+
|
|
78
|
+
// A macro to disallow the copy constructor and operator= functions
|
|
79
|
+
// This should be used in the private: declarations for a class
|
|
80
|
+
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
|
|
81
|
+
TypeName(const TypeName&); \
|
|
82
|
+
void operator=(const TypeName&)
|
|
83
|
+
|
|
84
|
+
// An older, deprecated, politically incorrect name for the above.
|
|
85
|
+
#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) DISALLOW_COPY_AND_ASSIGN(TypeName)
|
|
86
|
+
|
|
87
|
+
// A macro to disallow all the implicit constructors, namely the
|
|
88
|
+
// default constructor, copy constructor and operator= functions.
|
|
89
|
+
//
|
|
90
|
+
// This should be used in the private: declarations for a class
|
|
91
|
+
// that wants to prevent anyone from instantiating it. This is
|
|
92
|
+
// especially useful for classes containing only static methods.
|
|
93
|
+
#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
|
|
94
|
+
TypeName(); \
|
|
95
|
+
DISALLOW_COPY_AND_ASSIGN(TypeName)
|
|
96
|
+
|
|
97
|
+
// The arraysize(arr) macro returns the # of elements in an array arr.
|
|
98
|
+
// The expression is a compile-time constant, and therefore can be
|
|
99
|
+
// used in defining new arrays, for example. If you use arraysize on
|
|
100
|
+
// a pointer by mistake, you will get a compile-time error.
|
|
101
|
+
//
|
|
102
|
+
// One caveat is that arraysize() doesn't accept any array of an
|
|
103
|
+
// anonymous type or a type defined inside a function. In these rare
|
|
104
|
+
// cases, you have to use the unsafe ARRAYSIZE_UNSAFE() macro below. This is
|
|
105
|
+
// due to a limitation in C++'s template system. The limitation might
|
|
106
|
+
// eventually be removed, but it hasn't happened yet.
|
|
107
|
+
|
|
108
|
+
// This template function declaration is used in defining arraysize.
|
|
109
|
+
// Note that the function doesn't need an implementation, as we only
|
|
110
|
+
// use its type.
|
|
111
|
+
template <typename T, size_t N>
|
|
112
|
+
char (&ArraySizeHelper(T (&array)[N]))[N];
|
|
113
|
+
|
|
114
|
+
// That gcc wants both of these prototypes seems mysterious. VC, for
|
|
115
|
+
// its part, can't decide which to use (another mystery). Matching of
|
|
116
|
+
// template overloads: the final frontier.
|
|
117
|
+
#ifndef _MSC_VER
|
|
118
|
+
template <typename T, size_t N>
|
|
119
|
+
char (&ArraySizeHelper(const T (&array)[N]))[N];
|
|
120
|
+
#endif
|
|
121
|
+
|
|
122
|
+
#define arraysize(array) (sizeof(ArraySizeHelper(array)))
|
|
123
|
+
|
|
124
|
+
// ARRAYSIZE_UNSAFE performs essentially the same calculation as arraysize,
|
|
125
|
+
// but can be used on anonymous types or types defined inside
|
|
126
|
+
// functions. It's less safe than arraysize as it accepts some
|
|
127
|
+
// (although not all) pointers. Therefore, you should use arraysize
|
|
128
|
+
// whenever possible.
|
|
129
|
+
//
|
|
130
|
+
// The expression ARRAYSIZE_UNSAFE(a) is a compile-time constant of type
|
|
131
|
+
// size_t.
|
|
132
|
+
//
|
|
133
|
+
// ARRAYSIZE_UNSAFE catches a few type errors. If you see a compiler error
|
|
134
|
+
//
|
|
135
|
+
// "warning: division by zero in ..."
|
|
136
|
+
//
|
|
137
|
+
// when using ARRAYSIZE_UNSAFE, you are (wrongfully) giving it a pointer.
|
|
138
|
+
// You should only use ARRAYSIZE_UNSAFE on statically allocated arrays.
|
|
139
|
+
//
|
|
140
|
+
// The following comments are on the implementation details, and can
|
|
141
|
+
// be ignored by the users.
|
|
142
|
+
//
|
|
143
|
+
// ARRAYSIZE_UNSAFE(arr) works by inspecting sizeof(arr) (the # of bytes in
|
|
144
|
+
// the array) and sizeof(*(arr)) (the # of bytes in one array
|
|
145
|
+
// element). If the former is divisible by the latter, perhaps arr is
|
|
146
|
+
// indeed an array, in which case the division result is the # of
|
|
147
|
+
// elements in the array. Otherwise, arr cannot possibly be an array,
|
|
148
|
+
// and we generate a compiler error to prevent the code from
|
|
149
|
+
// compiling.
|
|
150
|
+
//
|
|
151
|
+
// Since the size of bool is implementation-defined, we need to cast
|
|
152
|
+
// !(sizeof(a) % sizeof(*(a))) to size_t in order to ensure the final
|
|
153
|
+
// result has type size_t.
|
|
154
|
+
//
|
|
155
|
+
// This macro is not perfect as it wrongfully accepts certain
|
|
156
|
+
// pointers, namely where the pointer size is divisible by the pointee
|
|
157
|
+
// size. Since all our code has to go through a 32-bit compiler,
|
|
158
|
+
// where a pointer is 4 bytes, this means all pointers to a type whose
|
|
159
|
+
// size is 3 or greater than 4 will be (righteously) rejected.
|
|
160
|
+
|
|
161
|
+
#define ARRAYSIZE_UNSAFE(a) \
|
|
162
|
+
((sizeof(a) / sizeof(*(a))) / \
|
|
163
|
+
static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
// Use implicit_cast as a safe version of static_cast or const_cast
|
|
167
|
+
// for upcasting in the type hierarchy (i.e. casting a pointer to Foo
|
|
168
|
+
// to a pointer to SuperclassOfFoo or casting a pointer to Foo to
|
|
169
|
+
// a const pointer to Foo).
|
|
170
|
+
// When you use implicit_cast, the compiler checks that the cast is safe.
|
|
171
|
+
// Such explicit implicit_casts are necessary in surprisingly many
|
|
172
|
+
// situations where C++ demands an exact type match instead of an
|
|
173
|
+
// argument type convertible to a target type.
|
|
174
|
+
//
|
|
175
|
+
// The From type can be inferred, so the preferred syntax for using
|
|
176
|
+
// implicit_cast is the same as for static_cast etc.:
|
|
177
|
+
//
|
|
178
|
+
// implicit_cast<ToType>(expr)
|
|
179
|
+
//
|
|
180
|
+
// implicit_cast would have been part of the C++ standard library,
|
|
181
|
+
// but the proposal was submitted too late. It will probably make
|
|
182
|
+
// its way into the language in the future.
|
|
183
|
+
template<typename To, typename From>
|
|
184
|
+
inline To implicit_cast(From const &f) {
|
|
185
|
+
return f;
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
// The COMPILE_ASSERT macro can be used to verify that a compile time
|
|
189
|
+
// expression is true. For example, you could use it to verify the
|
|
190
|
+
// size of a static array:
|
|
191
|
+
//
|
|
192
|
+
// COMPILE_ASSERT(ARRAYSIZE_UNSAFE(content_type_names) == CONTENT_NUM_TYPES,
|
|
193
|
+
// content_type_names_incorrect_size);
|
|
194
|
+
//
|
|
195
|
+
// or to make sure a struct is smaller than a certain size:
|
|
196
|
+
//
|
|
197
|
+
// COMPILE_ASSERT(sizeof(foo) < 128, foo_too_large);
|
|
198
|
+
//
|
|
199
|
+
// The second argument to the macro is the name of the variable. If
|
|
200
|
+
// the expression is false, most compilers will issue a warning/error
|
|
201
|
+
// containing the name of the variable.
|
|
202
|
+
|
|
203
|
+
template <bool>
|
|
204
|
+
struct CompileAssert {
|
|
205
|
+
};
|
|
206
|
+
|
|
207
|
+
#undef COMPILE_ASSERT
|
|
208
|
+
#define COMPILE_ASSERT(expr, msg) \
|
|
209
|
+
typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
|
|
210
|
+
|
|
211
|
+
// Implementation details of COMPILE_ASSERT:
|
|
212
|
+
//
|
|
213
|
+
// - COMPILE_ASSERT works by defining an array type that has -1
|
|
214
|
+
// elements (and thus is invalid) when the expression is false.
|
|
215
|
+
//
|
|
216
|
+
// - The simpler definition
|
|
217
|
+
//
|
|
218
|
+
// #define COMPILE_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1]
|
|
219
|
+
//
|
|
220
|
+
// does not work, as gcc supports variable-length arrays whose sizes
|
|
221
|
+
// are determined at run-time (this is gcc's extension and not part
|
|
222
|
+
// of the C++ standard). As a result, gcc fails to reject the
|
|
223
|
+
// following code with the simple definition:
|
|
224
|
+
//
|
|
225
|
+
// int foo;
|
|
226
|
+
// COMPILE_ASSERT(foo, msg); // not supposed to compile as foo is
|
|
227
|
+
// // not a compile-time constant.
|
|
228
|
+
//
|
|
229
|
+
// - By using the type CompileAssert<(bool(expr))>, we ensure that
|
|
230
|
+
// expr is a compile-time constant. (Template arguments must be
|
|
231
|
+
// determined at compile-time.)
|
|
232
|
+
//
|
|
233
|
+
// - The outer parentheses in CompileAssert<(bool(expr))> are necessary
|
|
234
|
+
// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written
|
|
235
|
+
//
|
|
236
|
+
// CompileAssert<bool(expr)>
|
|
237
|
+
//
|
|
238
|
+
// instead, these compilers will refuse to compile
|
|
239
|
+
//
|
|
240
|
+
// COMPILE_ASSERT(5 > 0, some_message);
|
|
241
|
+
//
|
|
242
|
+
// (They seem to think the ">" in "5 > 0" marks the end of the
|
|
243
|
+
// template argument list.)
|
|
244
|
+
//
|
|
245
|
+
// - The array size is (bool(expr) ? 1 : -1), instead of simply
|
|
246
|
+
//
|
|
247
|
+
// ((expr) ? 1 : -1).
|
|
248
|
+
//
|
|
249
|
+
// This is to avoid running into a bug in MS VC 7.1, which
|
|
250
|
+
// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
// MetatagId refers to metatag-id that we assign to
|
|
254
|
+
// each metatag <name, value> pair.
|
|
255
|
+
typedef uint32 MetatagId;
|
|
256
|
+
|
|
257
|
+
// Argument type used in interfaces that can optionally take ownership
|
|
258
|
+
// of a passed in argument. If TAKE_OWNERSHIP is passed, the called
|
|
259
|
+
// object takes ownership of the argument. Otherwise it does not.
|
|
260
|
+
enum Ownership {
|
|
261
|
+
DO_NOT_TAKE_OWNERSHIP,
|
|
262
|
+
TAKE_OWNERSHIP
|
|
263
|
+
};
|
|
264
|
+
|
|
265
|
+
// bit_cast<Dest,Source> is a template function that implements the
|
|
266
|
+
// equivalent of "*reinterpret_cast<Dest*>(&source)". We need this in
|
|
267
|
+
// very low-level functions like the protobuf library and fast math
|
|
268
|
+
// support.
|
|
269
|
+
//
|
|
270
|
+
// float f = 3.14159265358979;
|
|
271
|
+
// int i = bit_cast<int32>(f);
|
|
272
|
+
// // i = 0x40490fdb
|
|
273
|
+
//
|
|
274
|
+
// The classical address-casting method is:
|
|
275
|
+
//
|
|
276
|
+
// // WRONG
|
|
277
|
+
// float f = 3.14159265358979; // WRONG
|
|
278
|
+
// int i = * reinterpret_cast<int*>(&f); // WRONG
|
|
279
|
+
//
|
|
280
|
+
// The address-casting method actually produces undefined behavior
|
|
281
|
+
// according to ISO C++ specification section 3.10 -15-. Roughly, this
|
|
282
|
+
// section says: if an object in memory has one type, and a program
|
|
283
|
+
// accesses it with a different type, then the result is undefined
|
|
284
|
+
// behavior for most values of "different type".
|
|
285
|
+
//
|
|
286
|
+
// This is true for any cast syntax, either *(int*)&f or
|
|
287
|
+
// *reinterpret_cast<int*>(&f). And it is particularly true for
|
|
288
|
+
// conversions between integral lvalues and floating-point lvalues.
|
|
289
|
+
//
|
|
290
|
+
// The purpose of 3.10 -15- is to allow optimizing compilers to assume
|
|
291
|
+
// that expressions with different types refer to different memory. gcc
|
|
292
|
+
// 4.0.1 has an optimizer that takes advantage of this. So a
|
|
293
|
+
// non-conforming program quietly produces wildly incorrect output.
|
|
294
|
+
//
|
|
295
|
+
// The problem is not the use of reinterpret_cast. The problem is type
|
|
296
|
+
// punning: holding an object in memory of one type and reading its bits
|
|
297
|
+
// back using a different type.
|
|
298
|
+
//
|
|
299
|
+
// The C++ standard is more subtle and complex than this, but that
|
|
300
|
+
// is the basic idea.
|
|
301
|
+
//
|
|
302
|
+
// Anyways ...
|
|
303
|
+
//
|
|
304
|
+
// bit_cast<> calls memcpy() which is blessed by the standard,
|
|
305
|
+
// especially by the example in section 3.9. Also, of course,
|
|
306
|
+
// bit_cast<> wraps up the nasty logic in one place.
|
|
307
|
+
//
|
|
308
|
+
// Fortunately memcpy() is very fast. In optimized mode, with a
|
|
309
|
+
// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
|
|
310
|
+
// code with the minimal amount of data movement. On a 32-bit system,
|
|
311
|
+
// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
|
|
312
|
+
// compiles to two loads and two stores.
|
|
313
|
+
//
|
|
314
|
+
// I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1.
|
|
315
|
+
//
|
|
316
|
+
// WARNING: if Dest or Source is a non-POD type, the result of the memcpy
|
|
317
|
+
// is likely to surprise you.
|
|
318
|
+
|
|
319
|
+
template <class Dest, class Source>
|
|
320
|
+
inline Dest bit_cast(const Source& source) {
|
|
321
|
+
// Compile time assertion: sizeof(Dest) == sizeof(Source)
|
|
322
|
+
// A compile error here means your Dest and Source have different sizes.
|
|
323
|
+
typedef char VerifySizesAreEqual [sizeof(Dest) == sizeof(Source) ? 1 : -1];
|
|
324
|
+
|
|
325
|
+
Dest dest;
|
|
326
|
+
memcpy(&dest, &source, sizeof(dest));
|
|
327
|
+
return dest;
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
// The following enum should be used only as a constructor argument to indicate
|
|
331
|
+
// that the variable has static storage class, and that the constructor should
|
|
332
|
+
// do nothing to its state. It indicates to the reader that it is legal to
|
|
333
|
+
// declare a static instance of the class, provided the constructor is given
|
|
334
|
+
// the base::LINKER_INITIALIZED argument. Normally, it is unsafe to declare a
|
|
335
|
+
// static variable that has a constructor or a destructor because invocation
|
|
336
|
+
// order is undefined. However, IF the type can be initialized by filling with
|
|
337
|
+
// zeroes (which the loader does for static variables), AND the destructor also
|
|
338
|
+
// does nothing to the storage, AND there are no virtual methods, then a
|
|
339
|
+
// constructor declared as
|
|
340
|
+
// explicit MyClass(base::LinkerInitialized x) {}
|
|
341
|
+
// and invoked as
|
|
342
|
+
// static MyClass my_variable_name(base::LINKER_INITIALIZED);
|
|
343
|
+
namespace base {
|
|
344
|
+
enum LinkerInitialized { LINKER_INITIALIZED };
|
|
345
|
+
} // base
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
#endif // BASE_BASICTYPES_H_
|