language_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
@@ -0,0 +1,53 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
//
|
5
|
+
// Created by postproc-shortwords 1.6 on 2008-10-07 16:15:48
|
6
|
+
// From input file /tmp/input_10p_l8_sort.utf8
|
7
|
+
// See compact_lang_det.cc for usage
|
8
|
+
//
|
9
|
+
#include "encodings/compact_lang_det/cldutil.h"
|
10
|
+
|
11
|
+
// Suppressed:
|
12
|
+
// az-Arab az-Cyrl ku-Latn tg-Arab za-Hani zzb-Latn zze-Latn zzh-Latn ru-Latn
|
13
|
+
|
14
|
+
// Remapped:
|
15
|
+
// xxx-Latn=>ut-Latn sh-Latn=>hr-Latn sh-Cyrl=>sr-Cyrl
|
16
|
+
|
17
|
+
// ms/id probabilities leveled
|
18
|
+
|
19
|
+
static const int kLongWord8TableBuildDate = 20081007; // yyyymmdd; matches the 2008-10-07 creation stamp in the file header
|
20
|
+
|
21
|
+
COMPILE_ASSERT(MONTENEGRIN == 160, k_montenegrin_changed); // Compile-time check: break the build if MONTENEGRIN is no longer 160 (presumably the generated data depends on this numbering - confirm)
|
22
|
+
COMPILE_ASSERT(EXT_NUM_LANGUAGES == 209, k_ext_num_languages_changed); // Compile-time check: break the build if EXT_NUM_LANGUAGES is no longer 209
|
23
|
+
|
24
|
+
static const int kLongWord8TableSize = 1; // Bucket count; used as the array bound of kLongWord8Table, which holds one all-zero bucket
|
25
|
+
static const int kLongWord8TableKeyMask = 0xffffffff; // Mask hash key. NOTE(review): 0xffffffff does not fit in a signed 32-bit int, so the stored value is -1 on two's-complement targets - confirm consumers treat the mask as unsigned
|
26
|
+
|
27
|
+
COMPILE_ASSERT(MONTENEGRIN == 160, k_montenegrin_changed); // NOTE(review): repeats the identical assertion made a few lines earlier in this file; looks like a generator artifact
|
28
|
+
COMPILE_ASSERT(EXT_NUM_LANGUAGES == 209, k_ext_num_languages_changed); // NOTE(review): repeats the identical assertion made a few lines earlier in this file
|
29
|
+
|
30
|
+
// Empty table: kLongWord8TableSize buckets (one), whose only bucket is all zeros.
|
30
|
+
static const cld::IndirectProbBucket4 kLongWord8Table[kLongWord8TableSize] = {
|
31
|
+
// key[4], words[4] in UTF-8 (generator's column legend; no words are listed for this empty table)
|
32
|
+
// value[4]
|
33
|
+
{ {0x00000000,0x00000000,0x00000000,0x00000000}}, // [000] c -- bucket index 0, four zero words
|
34
|
+
};
|
36
|
+
|
37
|
+
static const uint32 kLongWord8TableInd[1] = { // Companion array handed to kLongWord8Table_obj together with kLongWord8Table; one zero entry here
|
37
|
+
// [0000] index of the first entry on the following row
|
38
|
+
0x00000000, };
|
40
|
+
|
41
|
+
COMPILE_ASSERT(1 < (1 << 16), k_indirectbits_too_small); // 1 < 65536 always holds. NOTE(review): the literal 1 is presumably the generated length of kLongWord8TableInd checked against a 16-bit limit - confirm against the generator
|
42
|
+
|
43
|
+
|
44
|
+
extern const cld::CLDTableSummary kLongWord8Table_obj = { // Definition (it has an initializer); 'extern' gives this const object external linkage so other translation units can refer to it
|
45
|
+
kLongWord8Table, // the bucket array defined in this file
|
46
|
+
kLongWord8TableInd, // the companion uint32 array defined in this file
|
47
|
+
kLongWord8TableSize, // bucket count (1)
|
48
|
+
arraysize(kLongWord8TableInd), // presumably the element count of kLongWord8TableInd (arraysize is defined outside this file)
|
49
|
+
kLongWord8TableKeyMask, // hash key mask (0xffffffff)
|
50
|
+
kLongWord8TableBuildDate, // build date as yyyymmdd (20081007)
|
51
|
+
};
|
52
|
+
|
53
|
+
// End of generated tables
|
@@ -0,0 +1,10 @@
|
|
1
|
+
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_GENERATED_MEANSCORE_H__
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_GENERATED_MEANSCORE_H__
|
7
|
+
|
8
|
+
extern const short kMeanScore[]; // Declaration only: the array is defined in another translation unit, and its length is not visible through this header
|
9
|
+
|
10
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_GENERATED_MEANSCORE_H__
|
@@ -0,0 +1,50 @@
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
//
|
5
|
+
// Created by postproc-shortwords 1.8 on 2009-03-22 11:11:34
|
6
|
+
// From input file /tmp/good_quad_input4567_sort.utf8
|
7
|
+
// See compact_lang_det.cc for usage
|
8
|
+
//
|
9
|
+
#include "encodings/compact_lang_det/cldutil.h"
|
10
|
+
|
11
|
+
// Suppressed:
|
12
|
+
// ms-Latn gl-Latn mt-Latn af-Latn eu-Latn mk-Cyrl fa-Arab
|
13
|
+
|
14
|
+
// Remapped:
|
15
|
+
// xxx-Latn=>ut-Latn sh-Latn=>hr-Latn sh-Cyrl=>sr-Cyrl
|
16
|
+
|
17
|
+
// ms/id probabilities leveled
|
18
|
+
|
19
|
+
static const int kQuadTableBuildDate = 20090322; // yyyymmdd; matches the 2009-03-22 creation stamp in the file header
|
20
|
+
|
21
|
+
COMPILE_ASSERT(MONTENEGRIN == 160, k_montenegrin_changed); // Compile-time check: break the build if MONTENEGRIN is no longer 160 (presumably the generated data depends on this numbering - confirm)
|
22
|
+
COMPILE_ASSERT(EXT_NUM_LANGUAGES == 209, k_ext_num_languages_changed); // Compile-time check: break the build if EXT_NUM_LANGUAGES is no longer 209
|
23
|
+
|
24
|
+
|
25
|
+
static const int kQuadTableSize = 1; // Bucket count; used as the array bound of kQuadTable, which holds one all-zero bucket
|
26
|
+
static const int kQuadTableKeyMask = 0xffffffff; // Mask hash key. NOTE(review): 0xffffffff does not fit in a signed 32-bit int, so the stored value is -1 on two's-complement targets - confirm consumers treat the mask as unsigned
|
27
|
+
|
28
|
+
|
29
|
+
// Empty table: kQuadTableSize buckets (one), whose only bucket is all zeros.
|
30
|
+
static const cld::IndirectProbBucket4 kQuadTable[kQuadTableSize] = {
|
31
|
+
// key[4], words[4] in UTF-8 (generator's column legend; no words are listed for this empty table)
|
32
|
+
// value[4]
|
33
|
+
{ {0x00000000,0x00000000,0x00000000,0x00000000}}, // [000] c -- bucket index 0, four zero words
|
34
|
+
};
|
35
|
+
|
36
|
+
static const uint32 kQuadTableInd[1] = { // Companion array handed to kQuadTable_obj together with kQuadTable; one zero entry here
|
37
|
+
// [0000] index of the first entry on the following row
|
38
|
+
0x00000000, };
|
39
|
+
|
40
|
+
|
41
|
+
extern const cld::CLDTableSummary kQuadTable_obj = { // Definition (it has an initializer); 'extern' gives this const object external linkage so other translation units can refer to it
|
42
|
+
kQuadTable, // the bucket array defined in this file
|
43
|
+
kQuadTableInd, // the companion uint32 array defined in this file
|
44
|
+
kQuadTableSize, // bucket count (1)
|
45
|
+
arraysize(kQuadTableInd), // presumably the element count of kQuadTableInd (arraysize is defined outside this file)
|
46
|
+
kQuadTableKeyMask, // hash key mask (0xffffffff)
|
47
|
+
kQuadTableBuildDate, // build date as yyyymmdd (20090322)
|
48
|
+
};
|
49
|
+
|
50
|
+
// End of generated tables
|