language_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
@@ -0,0 +1,61 @@
|
|
1
|
+
// Copyright 2009 Google Inc.
//
// Created by utf8tablebuilder version 2.8
// See util/utf8/utf8statetable.h for usage
//
// Maps properties of all codes from file:
// compact_lang_det_generated_ctjkvz.txt
// Accepts all other UTF-8 codes 0000..10FFFF
// Space optimized
//
// ** ASSUMES INPUT IS STRUCTURALLY VALID UTF-8 **
//
// Table offsets for byte 2-of-3 and byte 3-of-4 are
// multiplied by 16; offsets for 3-of-3 and 4-of-4 are
// relative +/-127 from previous state.
|
16
|
+
|
17
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
18
|
+
|
19
|
+
|
20
|
+
// Entire table has 1 state block of 64 entries
//
// Scalar parameters of the ctjkvz_b1 table. Each constant below is passed,
// in this same order, as one of the leading initializers of
// compact_lang_det_generated_ctjkvz_b1_obj later in this file.
// NOTE(review): the meaning of each field is defined by UTF8PropObj, which is
// not visible here -- confirm against cld_utf8statetable.h before changing
// any value.
static const unsigned int compact_lang_det_generated_ctjkvz_b1_STATE0 = 0;       // state[0]
static const unsigned int compact_lang_det_generated_ctjkvz_b1_STATE0_SIZE = 64; // =[1]
static const unsigned int compact_lang_det_generated_ctjkvz_b1_TOTAL_SIZE = 64;
static const unsigned int compact_lang_det_generated_ctjkvz_b1_MAX_EXPAND_X4 = 0;
static const unsigned int compact_lang_det_generated_ctjkvz_b1_SHIFT = 6;
static const unsigned int compact_lang_det_generated_ctjkvz_b1_BYTES = 1;
static const unsigned int compact_lang_det_generated_ctjkvz_b1_LOSUB = 0x80808080;
static const unsigned int compact_lang_det_generated_ctjkvz_b1_HIADD = 0x00000000;
|
30
|
+
|
31
|
+
// Byte table for the single state block of this table: 4 rows of 16 entries
// (64 total), every entry 0. The array is left unsized so its length is
// determined by the initializer list.
static const uint8 compact_lang_det_generated_ctjkvz_b1[] = {
// state[0] 0x000000 Byte 1 (row Ex offsets 16x small)
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

};
|
39
|
+
|
40
|
+
// Remap base[0] = (del, add, string_offset)
|
41
|
+
static const RemapEntry compact_lang_det_generated_ctjkvz_b1_remap_base[] = {
|
42
|
+
{0,0,0} };
|
43
|
+
|
44
|
+
// Remap string[0]
//
// One-byte array holding a single 0; this array is passed to
// compact_lang_det_generated_ctjkvz_b1_obj as its remap-string initializer.
static const unsigned char compact_lang_det_generated_ctjkvz_b1_remap_string[] = {
  0
};
|
47
|
+
|
48
|
+
// Table object for ctjkvz_b1. It is defined with `extern`, so unlike the
// `static` constants and arrays above it is visible to other translation
// units. The initializer bundles, in order: the eight scalar parameters, the
// byte table, the remap-base array, the remap-string array, and a final NULL.
// NOTE(review): the initializer order must match the member order of
// UTF8PropObj, and the meaning of the trailing NULL member is defined there;
// neither is visible in this file -- confirm against cld_utf8statetable.h.
extern const UTF8PropObj compact_lang_det_generated_ctjkvz_b1_obj = {
  compact_lang_det_generated_ctjkvz_b1_STATE0,
  compact_lang_det_generated_ctjkvz_b1_STATE0_SIZE,
  compact_lang_det_generated_ctjkvz_b1_TOTAL_SIZE,
  compact_lang_det_generated_ctjkvz_b1_MAX_EXPAND_X4,
  compact_lang_det_generated_ctjkvz_b1_SHIFT,
  compact_lang_det_generated_ctjkvz_b1_BYTES,
  compact_lang_det_generated_ctjkvz_b1_LOSUB,
  compact_lang_det_generated_ctjkvz_b1_HIADD,
  compact_lang_det_generated_ctjkvz_b1,
  compact_lang_det_generated_ctjkvz_b1_remap_base,
  compact_lang_det_generated_ctjkvz_b1_remap_string,
  NULL
};
|