language_detection 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,53 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+ //
5
+ // Created by postproc-shortwords 1.6 on 2008-10-07 16:15:48
6
+ // From input file /tmp/input_10p_l8_sort.utf8
7
+ // See compact_lang_det.cc for usage
8
+ //
9
+ #include "encodings/compact_lang_det/cldutil.h"
10
+
11
+ // Suppressed:
12
+ // az-Arab az-Cyrl ku-Latn tg-Arab za-Hani zzb-Latn zze-Latn zzh-Latn ru-Latn
13
+
14
+ // Remapped:
15
+ // xxx-Latn=>ut-Latn sh-Latn=>hr-Latn sh-Cyrl=>sr-Cyrl
16
+
17
+ // ms/id probabilities leveled
18
+
19
+ static const int kLongWord8TableBuildDate = 20081007; // yyyymmdd; same date as the "Created by" line in this file's header (2008-10-07)
20
+
21
+ COMPILE_ASSERT(MONTENEGRIN == 160, k_montenegrin_changed); // Requires MONTENEGRIN to equal 160 (COMPILE_ASSERT is defined outside this file; presumably a compile-time check)
22
+ COMPILE_ASSERT(EXT_NUM_LANGUAGES == 209, k_ext_num_languages_changed); // Requires EXT_NUM_LANGUAGES to equal 209
23
+
24
+ static const int kLongWord8TableSize = 1; // Bucket count; used as the array bound of kLongWord8Table
25
+ static const int kLongWord8TableKeyMask = 0xffffffff; // Mask hash key; all 32 bits set. NOTE(review): the literal exceeds INT_MAX for a 32-bit int, so the stored value is presumably -1 -- confirm consumers treat it as a bit pattern
26
+
27
+ COMPILE_ASSERT(MONTENEGRIN == 160, k_montenegrin_changed); // Repeats the MONTENEGRIN assertion that follows the build date
28
+ COMPILE_ASSERT(EXT_NUM_LANGUAGES == 209, k_ext_num_languages_changed); // Repeats the EXT_NUM_LANGUAGES assertion that follows the build date
29
+
30
+ // Empty table: the array holds kLongWord8TableSize (1) bucket, and all four 32-bit words of that bucket are zero
31
+ static const cld::IndirectProbBucket4 kLongWord8Table[kLongWord8TableSize] = {
32
+ // key[4], words[4] in UTF-8
33
+ // value[4]
34
+ { {0x00000000,0x00000000,0x00000000,0x00000000}}, // [000] c
35
+ };
36
+
37
+ static const uint32 kLongWord8TableInd[1] = { // One-element array; its only entry is zero
38
+ // [0000]
39
+ 0x00000000, };
40
+
41
+ COMPILE_ASSERT(1 < (1 << 16), k_indirectbits_too_small); // Requires 1 < 65536; NOTE(review): the left operand presumably is the length of kLongWord8TableInd -- confirm against the generator
42
+
43
+
44
+ extern const cld::CLDTableSummary kLongWord8Table_obj = { // `extern` gives this const object external linkage; the members are initialized positionally, so the order below must match the CLDTableSummary declaration (not visible in this file)
45
+ kLongWord8Table, // the one-bucket array
46
+ kLongWord8TableInd, // the one-element uint32 array
47
+ kLongWord8TableSize, // 1
48
+ arraysize(kLongWord8TableInd), // presumably the element count of kLongWord8TableInd, i.e. 1 (arraysize is defined outside this file)
49
+ kLongWord8TableKeyMask, // 0xffffffff
50
+ kLongWord8TableBuildDate, // 20081007
51
+ };
52
+
53
+ // End of generated tables
@@ -0,0 +1,10 @@
1
+ // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_GENERATED_MEANSCORE_H__
6
+ #define ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_GENERATED_MEANSCORE_H__
7
+ // Declaration only: kMeanScore is an array of const short whose bound is not specified here, so sizeof(kMeanScore) cannot be used by includers; the definition is not part of this header.
8
+ extern const short kMeanScore[];
9
+
10
+ #endif // ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_GENERATED_MEANSCORE_H__
@@ -0,0 +1,50 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+ //
5
+ // Created by postproc-shortwords 1.8 on 2009-03-22 11:11:34
6
+ // From input file /tmp/good_quad_input4567_sort.utf8
7
+ // See compact_lang_det.cc for usage
8
+ //
9
+ #include "encodings/compact_lang_det/cldutil.h"
10
+
11
+ // Suppressed:
12
+ // ms-Latn gl-Latn mt-Latn af-Latn eu-Latn mk-Cyrl fa-Arab
13
+
14
+ // Remapped:
15
+ // xxx-Latn=>ut-Latn sh-Latn=>hr-Latn sh-Cyrl=>sr-Cyrl
16
+
17
+ // ms/id probabilities leveled
18
+
19
+ static const int kQuadTableBuildDate = 20090322; // yyyymmdd; same date as the "Created by" line in this file's header (2009-03-22)
20
+
21
+ COMPILE_ASSERT(MONTENEGRIN == 160, k_montenegrin_changed); // Requires MONTENEGRIN to equal 160 (COMPILE_ASSERT is defined outside this file; presumably a compile-time check)
22
+ COMPILE_ASSERT(EXT_NUM_LANGUAGES == 209, k_ext_num_languages_changed); // Requires EXT_NUM_LANGUAGES to equal 209
23
+
24
+
25
+ static const int kQuadTableSize = 1; // Bucket count; used as the array bound of kQuadTable
26
+ static const int kQuadTableKeyMask = 0xffffffff; // Mask hash key; all 32 bits set. NOTE(review): the literal exceeds INT_MAX for a 32-bit int, so the stored value is presumably -1 -- confirm consumers treat it as a bit pattern
27
+
28
+
29
+ // Empty table: the array holds kQuadTableSize (1) bucket, and all four 32-bit words of that bucket are zero
30
+ static const cld::IndirectProbBucket4 kQuadTable[kQuadTableSize] = {
31
+ // key[4], words[4] in UTF-8
32
+ // value[4]
33
+ { {0x00000000,0x00000000,0x00000000,0x00000000}}, // [000] c
34
+ };
35
+
36
+ static const uint32 kQuadTableInd[1] = { // One-element array; its only entry is zero
37
+ // [0000]
38
+ 0x00000000, };
39
+
40
+
41
+ extern const cld::CLDTableSummary kQuadTable_obj = { // `extern` gives this const object external linkage; the members are initialized positionally, so the order below must match the CLDTableSummary declaration (not visible in this file)
42
+ kQuadTable, // the one-bucket array
43
+ kQuadTableInd, // the one-element uint32 array
44
+ kQuadTableSize, // 1
45
+ arraysize(kQuadTableInd), // presumably the element count of kQuadTableInd, i.e. 1 (arraysize is defined outside this file)
46
+ kQuadTableKeyMask, // 0xffffffff
47
+ kQuadTableBuildDate, // 20090322
48
+ };
49
+
50
+ // End of generated tables