cld-fixed 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/.rspec +2 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +27 -0
  6. data/README.md +34 -0
  7. data/Rakefile +5 -0
  8. data/cld.gemspec +22 -0
  9. data/ext/cld/Makefile.am +28 -0
  10. data/ext/cld/Makefile.in +790 -0
  11. data/ext/cld/aclocal.m4 +8895 -0
  12. data/ext/cld/base/basictypes.h +348 -0
  13. data/ext/cld/base/build_config.h +115 -0
  14. data/ext/cld/base/casts.h +156 -0
  15. data/ext/cld/base/commandlineflags.h +443 -0
  16. data/ext/cld/base/crash.h +41 -0
  17. data/ext/cld/base/dynamic_annotations.h +358 -0
  18. data/ext/cld/base/global_strip_options.h +59 -0
  19. data/ext/cld/base/log_severity.h +46 -0
  20. data/ext/cld/base/logging.h +1403 -0
  21. data/ext/cld/base/macros.h +243 -0
  22. data/ext/cld/base/port.h +54 -0
  23. data/ext/cld/base/scoped_ptr.h +428 -0
  24. data/ext/cld/base/stl_decl.h +0 -0
  25. data/ext/cld/base/stl_decl_msvc.h +107 -0
  26. data/ext/cld/base/string_util.h +29 -0
  27. data/ext/cld/base/strtoint.h +93 -0
  28. data/ext/cld/base/template_util.h +96 -0
  29. data/ext/cld/base/type_traits.h +198 -0
  30. data/ext/cld/base/vlog_is_on.h +143 -0
  31. data/ext/cld/build_aux/config.guess +1500 -0
  32. data/ext/cld/build_aux/config.sub +1616 -0
  33. data/ext/cld/build_aux/depcomp +584 -0
  34. data/ext/cld/build_aux/install-sh +507 -0
  35. data/ext/cld/build_aux/ltmain.sh +8745 -0
  36. data/ext/cld/build_aux/missing +367 -0
  37. data/ext/cld/cld_encodings.h +95 -0
  38. data/ext/cld/configure +17362 -0
  39. data/ext/cld/configure.ac +14 -0
  40. data/ext/cld/encodings/compact_lang_det/#cldutil.cc# +905 -0
  41. data/ext/cld/encodings/compact_lang_det/#cldutil.h# +1205 -0
  42. data/ext/cld/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
  43. data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
  44. data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
  45. data/ext/cld/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
  46. data/ext/cld/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
  47. data/ext/cld/encodings/compact_lang_det/#tote.cc# +299 -0
  48. data/ext/cld/encodings/compact_lang_det/#tote.h# +89 -0
  49. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  50. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  51. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  52. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  53. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  54. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  55. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  56. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  57. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  58. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  59. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  60. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  61. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  62. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  63. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  64. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  65. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  66. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  67. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  68. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  69. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  70. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  71. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  72. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  73. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  74. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  75. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  76. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  77. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  78. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  79. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  80. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  81. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  82. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  83. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  84. data/ext/cld/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
  85. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  86. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  87. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  88. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  89. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  90. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  91. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  92. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  93. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  94. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  95. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  96. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  97. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  98. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  99. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  100. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  101. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  102. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  103. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  104. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  105. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  106. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  107. data/ext/cld/encodings/internal/encodings.cc +12 -0
  108. data/ext/cld/encodings/lang_enc.h +254 -0
  109. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  110. data/ext/cld/encodings/public/encodings.h +301 -0
  111. data/ext/cld/extconf.rb +7 -0
  112. data/ext/cld/languages/internal/#languages.cc# +337 -0
  113. data/ext/cld/languages/internal/languages.cc +336 -0
  114. data/ext/cld/languages/proto/languages.pb.h +179 -0
  115. data/ext/cld/languages/public/languages.h +379 -0
  116. data/ext/cld/thunk.cc +55 -0
  117. data/lib/cld.rb +21 -0
  118. data/lib/cld/version.rb +3 -0
  119. data/spec/cld_spec.rb +67 -0
  120. data/spec/spec_helper.rb +6 -0
  121. metadata +193 -0
@@ -0,0 +1,171 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
7
+
8
+ #include "encodings/lang_enc.h"
9
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
10
+
11
+
12
+ static const int kCLDFlagFinish = 1;
13
+ static const int kCLDFlagSqueeze = 2;
14
+ static const int kCLDFlagRepeats = 4;
15
+ static const int kCLDFlagTop40 = 8;
16
+ static const int kCLDFlagShort = 16;
17
+ static const int kCLDFlagHint = 32; // Experimental, undebugged
18
+ static const int kCLDFlagUseWords = 64;
19
+
20
+ /***
21
+
22
+ Flag meanings:
23
+
24
+ Flags are used in the context of a recursive call from Detect to itself,
25
+ trying to deal in a more restrictive way with input that was not reliably
26
+ identified in the top-level call.
27
+
28
+ Finish -- Do not further recurse; return whatever result ensues, even if it is
29
+ unreliable. Typically set in any recursive call to take a second try
30
+ on unreliable text.
31
+
32
+ Squeeze -- For each text run, do an inplace cheapsqueeze to remove chunks of
33
+ highly repetitive text and chunks of text with too many 1- and
34
+ 2-letter words. This avoids scoring repetitive or useless non-text
35
+ crap in large files such as bogus JPEGs within an HTML file.
36
+
37
+ Repeats -- When scoring a text run, do a cheap prediction of each character
38
+ and do not score a unigram/quadgram if the last character of same is
39
+ correctly predicted. This is a slower, finer-grained form of
40
+ cheapsqueeze, typically used when the first pass got unreliable
41
+ results.
42
+
43
+ Top40 -- Restrict the set of scored languages to the Google "Top 40*", which is
44
+ actually 38 languages. This gets rid of about 110 languages that
45
+ represent about 0.7% of the web. Typically used when the first pass
46
+ got unreliable results.
47
+
48
+ Short -- Use trigram (three letter) scoring instead of quadgrams. Restricted to
49
+ the top 40* languages, Latin and Cyrillic scripts only.
50
+ Not as precise as quadgrams, but it gives some plausible result on
51
+ 1- or 2-word text in major languages.
52
+
53
+ Hint -- EXPERIMENTAL flag for compact_lang_det_test.cc to indicate a language
54
+ hint supplied in parameter plus_one.
55
+
56
+ UseWords -- In addition to scoring quad/uni/nil-grams, score complete words
57
+
58
+
59
+ Tentative decision logic:
60
+
61
+ In the middle of first pass -- After 4KB of text, look at the front 256 bytes
62
+ of every full 4KB buffer. If it compresses very well (say 3:1) or has
63
+ lots of spaces (say 1 of every 4 bytes), assume that the input is
64
+ large and contains lots of bogus non-text. Recurse, passing the
65
+ Squeeze flag to strip out chunks of this non-text.
66
+
67
+ At the end of the first pass --
68
+ If the top language is reliable and >= 70% of the document, return.
69
+ Else if the top language is reliable and top+2nd >= say 94%, return.
70
+ Else, either the top language is not reliable or there is a lot of
71
+ other crap.
72
+ ***/
73
+
74
+
75
+ namespace CompactLangDet {
76
+ struct DetectionTables;
77
+ } // namespace CompactLangDet
78
+
79
+
80
+ namespace CompactLangDetImpl {
81
+ // Scan interchange-valid UTF-8 bytes and detect most likely language,
82
+ // or set of languages.
83
+ //
84
+ // Design goals:
85
+ // Skip over big stretches of HTML tags
86
+ // Able to return ranges of different languages
87
+ // Relatively small tables and relatively fast processing
88
+ // Thread safe
89
+ //
90
+
91
+ typedef struct {
92
+ int perscript_count;
93
+ const Language* perscript_lang;
94
+ } PerScriptPair;
95
+
96
+ typedef struct {
97
+ // Constants for hashing 4-7 byte quadgram to 32 bits
98
+ const int kQuadHashB4Shift;
99
+ const int kQuadHashB4bShift;
100
+ const int kQuadHashB5Shift;
101
+ const int kQuadHashB5bShift;
102
+ // Constants for hashing 32 bits to kQuadKeyTable subscript/key
103
+ const int kHashvalToSubShift;
104
+ const uint32 kHashvalToSubMask;
105
+ const int kHashvalToKeyShift;
106
+ const uint32 kHashvalToKeyMask;
107
+ const int kHashvalAssociativity;
108
+ // Pointers to the actual tables
109
+ const PerScriptPair* kPerScriptPair;
110
+ const uint16* kQuadKeyTable;
111
+ const uint32* kQuadValueTable;
112
+ } LangDetObj;
113
+
114
+ // For HTML documents, tags are skipped, along with <script> ... </script>
115
+ // and <style> ... </style> sequences, and entities are expanded.
116
+ //
117
+ // We distinguish between bytes of the raw input buffer and bytes of non-tag
118
+ // text letters. Since tags can be over 50% of the bytes of an HTML Page,
119
+ // and are nearly all seven-bit ASCII English, we prefer to distinguish
120
+ // language mixture fractions based on just the non-tag text.
121
+ //
122
+ // Inputs: text and text_length
123
+ // is_plain_text if true says to NOT parse/skip HTML tags nor entities
124
+ // Outputs:
125
+ // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
126
+ // percent3 is an array of the text percentages 0..100 of the top 3 languages
127
+ // normalized_score3 is an array of internal scores, normalized to the
128
+ // average score for each language over a body of training text. A
129
+ // normalized score significantly away from 1.0 indicates very skewed text
130
+ // or gibberish.
131
+ //
132
+ // text_bytes is the amount of non-tag/letters-only text found
133
+ // is_reliable set true if the returned Language is at least 2**30 times more
134
+ // probable than the second-best Language
135
+ //
136
+ // Return value: the most likely Language for the majority of the input text
137
+ // Length 0 input and text with no reliable letter sequences returns
138
+ // UNKNOWN_LANGUAGE
139
+ //
140
+ // Subsetting: For fast detection over large documents, these routines will
141
+ // scan non-tag text of the initial part of a document, then will
142
+ // skip 4-16 bytes and subsample text in the rest of the document, up to a
143
+ // fixed limit (currently 160KB of non-tag letters).
144
+ //
145
+
146
+ Language DetectLanguageSummaryV25(
147
+ const CompactLangDet::DetectionTables* tables,
148
+ const char* buffer,
149
+ int buffer_length,
150
+ bool is_plain_text,
151
+ const char* tld_hint, // "id" boosts Indonesian
152
+ int encoding_hint, // SJS boosts Japanese
153
+ Language language_hint, // ITALIAN boosts it
154
+ bool allow_extended_lang,
155
+ int flags,
156
+ Language plus_one,
157
+ Language* language3,
158
+ int* percent3,
159
+ double* normalized_score3,
160
+ int* text_bytes,
161
+ bool* is_reliable);
162
+
163
+ // For unit testing:
164
+ // Remove portions of text that have a high density of spaces, or that are
165
+ // overly repetitive, squeezing the remaining text in-place to the front
166
+ // of the input buffer.
167
+ // Return the new, possibly-shorter length
168
+ int CheapSqueezeInplace(char* isrc, int srclen, int ichunksize);
169
+ }; // End namespace CompactLangDetImpl
170
+
171
+ #endif // ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
@@ -0,0 +1,545 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // This file extends lang_enc.cc with additional languages and extended routines
6
+ // It is current with Unicode 5.1 (beta Jan 2008)
7
+ //
8
+
9
+ #include <stdlib.h>
10
+ #include <stdio.h>
11
+ #include <string.h>
12
+
13
+ #include "encodings/compact_lang_det/ext_lang_enc.h"
14
+ #include "encodings/compact_lang_det/win/cld_macros.h"
15
+ #include "encodings/compact_lang_det/win/cld_strtoint.h"
16
+
17
+ // Language names above NUM_LANGUAGES
18
+ // These are also the C enum declared names
19
+ static const char* const kExtLanguageName[] = {
20
+ "X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD",
21
+
22
+ // Pseudo-languages for Unicode scripts that express a single language
23
+ "X_OGHAM", "X_RUNIC", "X_YI", "X_OLD_ITALIC", "X_GOTHIC",
24
+ "X_DESERET", "X_HANUNOO", "X_BUHID", "X_TAGBANWA", "X_TAI_LE",
25
+ "X_LINEAR_B", "X_UGARITIC", "X_SHAVIAN", "X_OSMANYA", "X_CYPRIOT",
26
+ "X_BUGINESE", "X_COPTIC", "X_NEW_TAI_LUE", "X_GLAGOLITIC", "X_TIFINAGH",
27
+ "X_SYLOTI_NAGRI", "X_OLD_PERSIAN", "X_KHAROSHTHI", "X_BALINESE", "X_CUNEIFORM",
28
+ "X_PHOENICIAN", "X_PHAGS_PA", "X_NKO",
29
+
30
+ // Unicode 5.1
31
+ "X_SUDANESE", "X_LEPCHA", "X_OL_CHIKI", "X_VAI", "X_SAURASHTRA",
32
+ "X_KAYAH_LI", "X_REJANG", "X_LYCIAN", "X_CARIAN", "X_LYDIAN",
33
+ "X_CHAM",
34
+ };
35
+
36
+
37
+ // These are the C enum declared names, for programs creating C code
38
+ static const char* const kExtLangDeclaredName[] = {
39
+ "ENGLISH", /* 0 */
40
+ "DANISH", /* 1 */
41
+ "DUTCH", /* 2 */
42
+ "FINNISH", /* 3 */
43
+ "FRENCH", /* 4 */
44
+ "GERMAN", /* 5 */
45
+ "HEBREW", /* 6 */
46
+ "ITALIAN", /* 7 */
47
+ "JAPANESE", /* 8 */
48
+ "KOREAN", /* 9 */
49
+ "NORWEGIAN", /* 10 */
50
+ "POLISH", /* 11 */
51
+ "PORTUGUESE", /* 12 */
52
+ "RUSSIAN", /* 13 */
53
+ "SPANISH", /* 14 */
54
+ "SWEDISH", /* 15 */
55
+ "CHINESE", /* 16 */
56
+ "CZECH", /* 17 */
57
+ "GREEK", /* 18 */
58
+ "ICELANDIC", /* 19 */
59
+ "LATVIAN", /* 20 */
60
+ "LITHUANIAN", /* 21 */
61
+ "ROMANIAN", /* 22 */
62
+ "HUNGARIAN", /* 23 */
63
+ "ESTONIAN", /* 24 */
64
+ "TG_UNKNOWN_LANGUAGE", /* 25 */
65
+ "UNKNOWN_LANGUAGE", /* 26 */
66
+ "BULGARIAN", /* 27 */
67
+ "CROATIAN", /* 28 */
68
+ "SERBIAN", /* 29 */
69
+ "IRISH", /* 30 */
70
+ "GALICIAN", /* 31 */
71
+ "TAGALOG", /* 32 */
72
+ "TURKISH", /* 33 */
73
+ "UKRAINIAN", /* 34 */
74
+ "HINDI", /* 35 */
75
+ "MACEDONIAN", /* 36 */
76
+ "BENGALI", /* 37 */
77
+ "INDONESIAN", /* 38 */
78
+ "LATIN", /* 39 */
79
+ "MALAY", /* 40 */
80
+ "MALAYALAM", /* 41 */
81
+ "WELSH", /* 42 */
82
+ "NEPALI", /* 43 */
83
+ "TELUGU", /* 44 */
84
+ "ALBANIAN", /* 45 */
85
+ "TAMIL", /* 46 */
86
+ "BELARUSIAN", /* 47 */
87
+ "JAVANESE", /* 48 */
88
+ "OCCITAN", /* 49 */
89
+ "URDU", /* 50 */
90
+ "BIHARI", /* 51 */
91
+ "GUJARATI", /* 52 */
92
+ "THAI", /* 53 */
93
+ "ARABIC", /* 54 */
94
+ "CATALAN", /* 55 */
95
+ "ESPERANTO", /* 56 */
96
+ "BASQUE", /* 57 */
97
+ "INTERLINGUA", /* 58 */
98
+ "KANNADA", /* 59 */
99
+ "PUNJABI", /* 60 */
100
+ "SCOTS_GAELIC", /* 61 */
101
+ "SWAHILI", /* 62 */
102
+ "SLOVENIAN", /* 63 */
103
+ "MARATHI", /* 64 */
104
+ "MALTESE", /* 65 */
105
+ "VIETNAMESE", /* 66 */
106
+ "FRISIAN", /* 67 */
107
+ "SLOVAK", /* 68 */
108
+ "CHINESE_T", /* 69 */
109
+ "FAROESE", /* 70 */
110
+ "SUNDANESE", /* 71 */
111
+ "UZBEK", /* 72 */
112
+ "AMHARIC", /* 73 */
113
+ "AZERBAIJANI", /* 74 */
114
+ "GEORGIAN", /* 75 */
115
+ "TIGRINYA", /* 76 */
116
+ "PERSIAN", /* 77 */
117
+ "BOSNIAN", /* 78 */
118
+ "SINHALESE", /* 79 */
119
+ "NORWEGIAN_N", /* 80 */
120
+ "PORTUGUESE_P", /* 81 */
121
+ "PORTUGUESE_B", /* 82 */
122
+ "XHOSA", /* 83 */
123
+ "ZULU", /* 84 */
124
+ "GUARANI", /* 85 */
125
+ "SESOTHO", /* 86 */
126
+ "TURKMEN", /* 87 */
127
+ "KYRGYZ", /* 88 */
128
+ "BRETON", /* 89 */
129
+ "TWI", /* 90 */
130
+ "YIDDISH", /* 91 */
131
+ "SERBO_CROATIAN", /* 92 */
132
+ "SOMALI", /* 93 */
133
+ "UIGHUR", /* 94 */
134
+ "KURDISH", /* 95 */
135
+ "MONGOLIAN", /* 96 */
136
+ "ARMENIAN", /* 97 */
137
+ "LAOTHIAN", /* 98 */
138
+ "SINDHI", /* 99 */
139
+ "RHAETO_ROMANCE", /* 100 */
140
+ "AFRIKAANS", /* 101 */
141
+ "LUXEMBOURGISH", /* 102 */
142
+ "BURMESE", /* 103 */
143
+ "KHMER", /* 104 */
144
+ "TIBETAN", /* 105 */
145
+ "DHIVEHI", /* 106 */ // sometimes spelled Divehi; lang of Maldives
146
+ "CHEROKEE", /* 107 */
147
+ "SYRIAC", /* 108 */
148
+ "LIMBU", /* 109 */
149
+ "ORIYA", /* 110 */
150
+ "ASSAMESE", /* 111 */
151
+ "CORSICAN", /* 112 */
152
+ "INTERLINGUE", /* 113 */
153
+ "KAZAKH", /* 114 */
154
+ "LINGALA", /* 115 */
155
+ "MOLDAVIAN", /* 116 */
156
+ "PASHTO", /* 117 */
157
+ "QUECHUA", /* 118 */
158
+ "SHONA", /* 119 */
159
+ "TAJIK", /* 120 */
160
+ "TATAR", /* 121 */
161
+ "TONGA", /* 122 */
162
+ "YORUBA", /* 123 */
163
+ "CREOLES_AND_PIDGINS_ENGLISH_BASED", /* 124 */
164
+ "CREOLES_AND_PIDGINS_FRENCH_BASED", /* 125 */
165
+ "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", /* 126 */
166
+ "CREOLES_AND_PIDGINS_OTHER", /* 127 */
167
+ "MAORI", /* 128 */
168
+ "WOLOF", /* 129 */
169
+ "ABKHAZIAN", /* 130 */
170
+ "AFAR", /* 131 */
171
+ "AYMARA", /* 132 */
172
+ "BASHKIR", /* 133 */
173
+ "BISLAMA", /* 134 */
174
+ "DZONGKHA", /* 135 */
175
+ "FIJIAN", /* 136 */
176
+ "GREENLANDIC", /* 137 */
177
+ "HAUSA", /* 138 */
178
+ "HAITIAN_CREOLE", /* 139 */
179
+ "INUPIAK", /* 140 */
180
+ "INUKTITUT", /* 141 */
181
+ "KASHMIRI", /* 142 */
182
+ "KINYARWANDA", /* 143 */
183
+ "MALAGASY", /* 144 */
184
+ "NAURU", /* 145 */
185
+ "OROMO", /* 146 */
186
+ "RUNDI", /* 147 */
187
+ "SAMOAN", /* 148 */
188
+ "SANGO", /* 149 */
189
+ "SANSKRIT", /* 150 */
190
+ "SISWANT", /* 151 */
191
+ "TSONGA", /* 152 */
192
+ "TSWANA", /* 153 */
193
+ "VOLAPUK", /* 154 */
194
+ "ZHUANG", /* 155 */
195
+ "KHASI", /* 156 */
196
+ "SCOTS", /* 157 */
197
+ "GANDA", /* 158 */
198
+ "MANX", /* 159 */
199
+ "MONTENEGRIN", /* 160 */
200
+ // Add new language declared names just before here
201
+ };
202
+
203
+ COMPILE_ASSERT(arraysize(kExtLangDeclaredName) == NUM_LANGUAGES,
204
+ kExtLangDeclaredName_has_incorrect_length);
205
+
206
+
207
+ // Language codes above NUM_LANGUAGES
208
+ // I made all these up, except Klingon from ISO-639-2 (dsites)
209
+ // NOTE: zza is a standard name
210
+ static const char* const kExtLanguageCode[] = {
211
+ // "X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD",
212
+ // All Latin script
213
+ "zzb", "zzp", "zzh", "tlh", "zze",
214
+
215
+ // Pseudo-languages for Unicode scripts that express a single language
216
+ "xx-Ogam", "xx-Runr", "xx-Yiii", "xx-Ital", "xx-Goth",
217
+ "xx-Dsrt", "xx-Hano", "xx-Buhd", "xx-Tagb", "xx-Tale",
218
+ "xx-Linb", "xx-Ugar", "xx-Shaw", "xx-Osma", "xx-Cprt",
219
+ "xx-Bugi", "xx-Copt", "xx-Talu", "xx-Glag", "xx-Tfng",
220
+ "xx-Sylo", "xx-Xpeo", "xx-Khar", "xx-Bali", "xx-Xsux",
221
+ "xx-Phnx", "xx-Phag", "xx-Nkoo",
222
+
223
+ // Unicode 5.1
224
+ "xx-Sund", "xx-Lepc", "xx-Olck", "xx-Vaii", "xx-Saur",
225
+ "xx-Kali", "xx-Rjng", "xx-Lyci", "xx-Cari", "xx-Lydi",
226
+ "xx-Cham",
227
+ };
228
+
229
+
230
+ // Returns the display name for |lang| as output by the lang/enc identifier,
231
+ // e.g. "Korean". Negative values give "", and TG_UNKNOWN_LANGUAGE (the
232
+ // "ignore me" placeholder used to subtract out HTML, link farms, DNA strings
233
+ // and the like) gives "Ignore". Values outside both the base and the
234
+ // extended ranges give whatever invalid_language_name() returns.
235
+ const char* ExtLanguageName(const Language lang) {
236
+   // No-text-at-all result from a Tote
237
+   if (lang < 0) {return "";}
238
+
239
+   // CompactLanguageDetect extension: the placeholder pseudo-language
240
+   if (lang == TG_UNKNOWN_LANGUAGE) {return "Ignore";}
241
+
242
+   // Base languages are delegated to LanguageName()
243
+   const bool is_base = (lang >= 0) && (lang < NUM_LANGUAGES);
244
+   if (is_base) {return LanguageName(lang);}
245
+
246
+   // Extended languages are looked up in the local table
247
+   const bool is_ext = (lang >= EXT_LANGUAGE_BASE) && (lang < EXT_NUM_LANGUAGES);
248
+   if (is_ext) {return kExtLanguageName[lang - EXT_LANGUAGE_BASE];}
249
+
250
+   return invalid_language_name();
251
+ }
252
+
253
+
254
+ // Returns the C enum spelling of |lang| (e.g. "KOREAN"), for programs that
255
+ // create C declarations. Extended languages reuse kExtLanguageName, whose
256
+ // entries are also the enum spellings. Invalid input: "UNKNOWN_LANGUAGE".
257
+ const char* ExtLanguageDeclaredName(const Language lang) {
258
+   const char* declared = "UNKNOWN_LANGUAGE";
259
+   if ((lang >= 0) && (lang < NUM_LANGUAGES)) {
260
+     declared = kExtLangDeclaredName[lang];
261
+   } else if ((lang >= EXT_LANGUAGE_BASE) && (lang < EXT_NUM_LANGUAGES)) {
262
+     declared = kExtLanguageName[lang - EXT_LANGUAGE_BASE];
263
+   }
264
+   return declared;
265
+ }
266
+
267
+ // Returns the language code for |lang|, e.g. "ko"; "??" if it is out of range.
268
+ const char* ExtLanguageCode(const Language lang) {
269
+   // Hack for ignore/porn pseudo-language: it has a fixed stand-in code
270
+   if (lang == TG_UNKNOWN_LANGUAGE) {return "xxx";}
271
+
272
+   const bool in_base_range = (lang >= 0) && (lang < NUM_LANGUAGES);
273
+   const bool in_ext_range =
274
+       (lang >= EXT_LANGUAGE_BASE) && (lang < EXT_NUM_LANGUAGES);
275
+
276
+   // Base codes come from LanguageCode(); extended ones from the local table
277
+   if (in_base_range) {return LanguageCode(lang);}
278
+   if (in_ext_range) {return kExtLanguageCode[lang - EXT_LANGUAGE_BASE];}
279
+   return "??";
280
+ }
281
+
282
+
283
+ // Convert "en-Latn-GB" to ENGLISH; all-digit input is cast from its number
284
+ // Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
285
+ // Consider for later: NORWEGIAN, NORWEGIAN_N
286
+ // Consider for later: SCOTS, SCOTS_GAELIC
287
+ // Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
288
+ // src must be NUL-terminated. With no match, retlang is returned as last set.
289
+ Language GetLanguageFromNumberOrName(const char* src) {
290
+   if (strspn(src, "0123456789") == strlen(src)) {
291
+     // All digits (an empty string also lands here, both lengths being 0)
292
+     return static_cast<Language>(strto32(src, NULL, 10));
293
+   }
294
+
295
+   Language retlang = UNKNOWN_LANGUAGE;
296
+   size_t len = strlen(src);
297
+
298
+   if (true /*FLAGS_mergepairs*/) {  // strncmp: src may be shorter than a prefix
299
+     // Merge sets of languages pt-xx en-xx fr-xx, NOT bs/hr/sr
300
+     if (strncmp(src, "pt-", 3) == 0) {return PORTUGUESE;}
301
+     if (strncmp(src, "en-", 3) == 0) {return ENGLISH;}
302
+     if (strncmp(src, "fr-", 3) == 0) {return FRENCH;}
303
+     // Use NormalizeLanguage instead
304
+     if (strncmp(src, "bs-", 3) == 0) {return CROATIAN;}
305
+     if (strncmp(src, "hr-", 3) == 0) {return CROATIAN;}
306
+     if (strncmp(src, "sr-Latn", 7) == 0) {return CROATIAN;}
307
+     if (strncmp(src, "sh-Latn", 7) == 0) {return CROATIAN;}
308
+     if (strncmp(src, "sr-Cyrl", 7) == 0) {return SERBIAN;}
309
+     if (strncmp(src, "sh-Cyrl", 7) == 0) {return SERBIAN;}
310
+   }
311
+
312
+   // Extensions
313
+   if (len >= 3) {
314
+     // Standin for ignore/porn "language"
315
+     if (memcmp(src, "xxx", 3) == 0) {return TG_UNKNOWN_LANGUAGE;}
316
+
317
+     if (memcmp(src, "zzb", 3) == 0) {return X_BORK_BORK_BORK;}
318
+     if (memcmp(src, "zzp", 3) == 0) {return X_PIG_LATIN;}
319
+     if (memcmp(src, "zzh", 3) == 0) {return X_HACKER;}
320
+     if (memcmp(src, "tlh", 3) == 0) {return X_KLINGON;}
321
+     if (memcmp(src, "zze", 3) == 0) {return X_ELMER_FUDD;}
322
+   }
323
+
324
+   // We have a name like en-Latn-GB or pt-BR
325
+   // First, get rid of some special cases
326
+   if (len <= 3) {
327
+     LanguageFromCode(src, &retlang);
328
+   } else if (len == 7) {
329
+     // More Extensions
330
+     if (memcmp(src, "xx-", 3) == 0) {
331
+       if (memcmp(src, "xx-Ogam", 7) == 0) {return X_OGHAM;}
332
+       if (memcmp(src, "xx-Runr", 7) == 0) {return X_RUNIC;}
333
+       if (memcmp(src, "xx-Yiii", 7) == 0) {return X_YI;}
334
+       if (memcmp(src, "xx-Ital", 7) == 0) {return X_OLD_ITALIC;}
335
+       if (memcmp(src, "xx-Goth", 7) == 0) {return X_GOTHIC;}
336
+       if (memcmp(src, "xx-Dsrt", 7) == 0) {return X_DESERET;}
337
+       if (memcmp(src, "xx-Hano", 7) == 0) {return X_HANUNOO;}
338
+       if (memcmp(src, "xx-Buhd", 7) == 0) {return X_BUHID;}
339
+       if (memcmp(src, "xx-Tagb", 7) == 0) {return X_TAGBANWA;}
340
+       if (memcmp(src, "xx-Tale", 7) == 0) {return X_TAI_LE;}
341
+       if (memcmp(src, "xx-Linb", 7) == 0) {return X_LINEAR_B;}
342
+       if (memcmp(src, "xx-Ugar", 7) == 0) {return X_UGARITIC;}
343
+       if (memcmp(src, "xx-Shaw", 7) == 0) {return X_SHAVIAN;}
344
+       if (memcmp(src, "xx-Osma", 7) == 0) {return X_OSMANYA;}
345
+       if (memcmp(src, "xx-Cprt", 7) == 0) {return X_CYPRIOT;}
346
+       if (memcmp(src, "xx-Bugi", 7) == 0) {return X_BUGINESE;}
347
+       if (memcmp(src, "xx-Copt", 7) == 0) {return X_COPTIC;}
348
+       if (memcmp(src, "xx-Talu", 7) == 0) {return X_NEW_TAI_LUE;}
349
+       if (memcmp(src, "xx-Glag", 7) == 0) {return X_GLAGOLITIC;}
350
+       if (memcmp(src, "xx-Tfng", 7) == 0) {return X_TIFINAGH;}
351
+       if (memcmp(src, "xx-Sylo", 7) == 0) {return X_SYLOTI_NAGRI;}
352
+       if (memcmp(src, "xx-Xpeo", 7) == 0) {return X_OLD_PERSIAN;}
353
+       if (memcmp(src, "xx-Khar", 7) == 0) {return X_KHAROSHTHI;}
354
+       if (memcmp(src, "xx-Bali", 7) == 0) {return X_BALINESE;}
355
+       if (memcmp(src, "xx-Xsux", 7) == 0) {return X_CUNEIFORM;}
356
+       if (memcmp(src, "xx-Phnx", 7) == 0) {return X_PHOENICIAN;}
357
+       if (memcmp(src, "xx-Phag", 7) == 0) {return X_PHAGS_PA;}
358
+       if (memcmp(src, "xx-Nkoo", 7) == 0) {return X_NKO;}
359
+
360
+       // Unicode 5.1
361
+       if (memcmp(src, "xx-Sund", 7) == 0) {return X_SUDANESE;}
362
+       if (memcmp(src, "xx-Lepc", 7) == 0) {return X_LEPCHA;}
363
+       if (memcmp(src, "xx-Olck", 7) == 0) {return X_OL_CHIKI;}
364
+       if (memcmp(src, "xx-Vaii", 7) == 0) {return X_VAI;}
365
+       if (memcmp(src, "xx-Saur", 7) == 0) {return X_SAURASHTRA;}
366
+       if (memcmp(src, "xx-Kali", 7) == 0) {return X_KAYAH_LI;}
367
+       if (memcmp(src, "xx-Rjng", 7) == 0) {return X_REJANG;}
368
+       if (memcmp(src, "xx-Lyci", 7) == 0) {return X_LYCIAN;}
369
+       if (memcmp(src, "xx-Cari", 7) == 0) {return X_CARIAN;}
370
+       if (memcmp(src, "xx-Lydi", 7) == 0) {return X_LYDIAN;}
371
+       if (memcmp(src, "xx-Cham", 7) == 0) {return X_CHAM;}
372
+     }
373
+   }
374
+   // Some other weird ones
375
+   // Could be Latn or Limb; all our current training data is Latn
376
+   if (strcmp(src, "sit-NP") == 0) {return LIMBU;}
377
+   if (strcmp(src, "un-Latn") == 0) {return UNKNOWN_LANGUAGE;}
378
+
379
+   // Multi-country languages; a 2-char prefix match guarantees len >= 2 below
380
+   if (strncmp(src, "zh", 2) == 0) {
381
+     if (memcmp(&src[len - 2], "TW", 2) == 0) {return CHINESE_T;}
382
+     if (memcmp(&src[len - 2], "HK", 2) == 0) {return CHINESE_T;}
383
+     return CHINESE;
384
+   }
385
+   if (strncmp(src, "pt", 2) == 0) {
386
+     if (memcmp(&src[len - 2], "BR", 2) == 0) {return PORTUGUESE;}
387
+     return PORTUGUESE;
388
+   }
389
+   if (strncmp(src, "fr", 2) == 0) {
390
+     if (memcmp(&src[len -2], "CA", 2) == 0) {return FRENCH;}
391
+     return FRENCH;
392
+   }
393
+
394
+   // None of the special cases matched; check len before indexing into src
395
+   if ((len > 2) && (src[2] == '-')) {
396
+     char temp[4];
397
+     memcpy(temp, src, 4);  // len >= 3, so 4 bytes (incl. NUL) are readable
398
+     temp[2] = '\0';
399
+     LanguageFromCode(temp, &retlang);
400
+   }
401
+   if ((len > 3) && (src[3] == '-')) {
402
+     char temp[4];
403
+     memcpy(temp, src, 4);
404
+     temp[3] = '\0';
405
+     LanguageFromCode(temp, &retlang);
406
+   }
407
+   if (retlang != UNKNOWN_LANGUAGE) {
408
+     return retlang;
409
+   }
410
+
411
+   return retlang;
412
+ }
413
+
414
+ typedef struct {
415
+ const char* name;
416
+ UnicodeLScript lscript;
417
+ } NameScriptPair;
418
+
419
+ // In alphabetic order for binary search
420
+ static const NameScriptPair kNameScriptPair[] = {
421
+ // Unicode 5.1 additional scripts
422
+ {"Arab", ULScript_Arabic},
423
+ {"Armn", ULScript_Armenian},
424
+ {"Bali", ULScript_Balinese},
425
+ {"Beng", ULScript_Bengali},
426
+ {"Bugi", ULScript_Buginese},
427
+ {"Buhd", ULScript_Buhid},
428
+ {"Cans", ULScript_Canadian_Aboriginal},
429
+ {"Cari", ULScript_Carian}, // Unicode 5.1
430
+ {"Cham", ULScript_Cham}, // Unicode 5.1
431
+ {"Cher", ULScript_Cherokee},
432
+ {"Copt", ULScript_Coptic},
433
+ {"Cprt", ULScript_Cypriot},
434
+ {"Cyrl", ULScript_Cyrillic},
435
+ {"Deva", ULScript_Devanagari},
436
+ {"Dsrt", ULScript_Deseret},
437
+ {"Ethi", ULScript_Ethiopic},
438
+ {"Geor", ULScript_Georgian},
439
+ {"Glag", ULScript_Glagolitic},
440
+ {"Goth", ULScript_Gothic},
441
+ {"Grek", ULScript_Greek},
442
+ {"Gujr", ULScript_Gujarati},
443
+ {"Guru", ULScript_Gurmukhi},
444
+ {"Hani", ULScript_HanCJK},
445
+ {"Hano", ULScript_Hanunoo},
446
+ {"Hebr", ULScript_Hebrew},
447
+ {"Ital", ULScript_Old_Italic},
448
+ {"Kali", ULScript_Kayah_Li}, // Unicode 5.1
449
+ {"Khar", ULScript_Kharoshthi},
450
+ {"Khmr", ULScript_Khmer},
451
+ {"Knda", ULScript_Kannada},
452
+ {"Laoo", ULScript_Lao},
453
+ {"Latn", ULScript_Latin},
454
+ {"Lepc", ULScript_Lepcha}, // Unicode 5.1
455
+ {"Limb", ULScript_Limbu},
456
+ {"Linb", ULScript_Linear_B},
457
+ {"Lyci", ULScript_Lycian}, // Unicode 5.1
458
+ {"Lydi", ULScript_Lydian}, // Unicode 5.1
459
+ {"Mlym", ULScript_Malayalam},
460
+ {"Mong", ULScript_Mongolian},
461
+ {"Mymr", ULScript_Myanmar},
462
+ {"Nkoo", ULScript_Nko},
463
+ {"Ogam", ULScript_Ogham},
464
+ {"Olck", ULScript_Ol_Chiki}, // Unicode 5.1
465
+ {"Orya", ULScript_Oriya},
466
+ {"Osma", ULScript_Osmanya},
467
+ {"Phag", ULScript_Phags_Pa},
468
+ {"Phnx", ULScript_Phoenician},
469
+ {"Rjng", ULScript_Rejang}, // Unicode 5.1
470
+ {"Runr", ULScript_Runic},
471
+ {"Saur", ULScript_Saurashtra}, // Unicode 5.1
472
+ {"Shaw", ULScript_Shavian},
473
+ {"Sinh", ULScript_Sinhala},
474
+ {"Sund", ULScript_Sundanese}, // Unicode 5.1
475
+ {"Sylo", ULScript_Syloti_Nagri},
476
+ {"Syrc", ULScript_Syriac},
477
+ {"Tagb", ULScript_Tagbanwa},
478
+ {"Tale", ULScript_Tai_Le},
479
+ {"Talu", ULScript_New_Tai_Lue},
480
+ {"Taml", ULScript_Tamil},
481
+ {"Telu", ULScript_Telugu},
482
+ {"Tfng", ULScript_Tifinagh},
483
+ {"Tglg", ULScript_Tagalog},
484
+ {"Thaa", ULScript_Thaana},
485
+ {"Thai", ULScript_Thai},
486
+ {"Tibt", ULScript_Tibetan},
487
+ {"Ugar", ULScript_Ugaritic},
488
+ {"Vaii", ULScript_Vai}, // Unicode 5.1 // NOTE: apparently 'Vai '
489
+ {"Xpeo", ULScript_Old_Persian},
490
+ {"Xsux", ULScript_Cuneiform},
491
+ {"Yiii", ULScript_Yi},
492
+ {"Zyyy", ULScript_Common},
493
+ {"Zzzz", ULScript_Inherited},
494
+ };
495
+
496
+ // Convert "en-Latn-GB" to ULScript_Latin; unrecognized input gives Latin too
497
+ UnicodeLScript GetLScriptFromNumberOrName(const char* src) {
498
+   if (strspn(src, "0123456789") == strlen(src)) {
499
+     // All digits (or empty): cast the parsed number to a UnicodeLScript
500
+     return static_cast<UnicodeLScript>(strto32(src, NULL, 10));
501
+   }
502
+
503
+   if (strcmp(src, "zh-TW") == 0) {return ULScript_HanCJK;}
504
+   if (strcmp(src, "zh-CN") == 0) {return ULScript_HanCJK;}
505
+   if (strcmp(src, "pt-BR") == 0) {return ULScript_Latin;}
506
+   if (strcmp(src, "pt-PT") == 0) {return ULScript_Latin;}
507
+   // Could be Latn or Limb; all our current training data is Latn
508
+   if (strcmp(src, "sit-NP") == 0) {return ULScript_Latin;}
509
+
510
+   // Isolate just the script field: up to 4 chars after the first '-'
511
+   char temp[5];
512
+   const char* src2 = strchr(src, '-');
513
+   if (src2 == NULL) {return ULScript_Latin;}
514
+   src2 += 1;  // over the -
515
+   strncpy(temp, src2, 4);  // unlike memcpy, never reads past src's NUL
516
+   temp[4] = '\0';
517
+
518
+   int lo = 0;
519
+   int hi = static_cast<int>(arraysize(kNameScriptPair));  // real table size
520
+   while (lo < hi) {
521
+     const int mid = (lo + hi) >> 1;
522
+     const int cmp = strcmp(temp, kNameScriptPair[mid].name);
523
+     if (cmp == 0) {return kNameScriptPair[mid].lscript;}
524
+     if (cmp < 0) {
525
+       hi = mid;
526
+     } else {
527
+       lo = mid + 1;
528
+     }
529
+   }
530
+   return ULScript_Latin;
531
+ }
532
+
533
+
534
+ // Collapses closely related languages onto one representative: Bosnian ->
535
+ // Croatian, Serbo-Croatian -> Serbian, both Portuguese variants -> Portuguese.
536
+ Language NormalizeLanguage(Language lang) {
537
+   switch (lang) {
538
+     case BOSNIAN:        return CROATIAN;
539
+     case SERBO_CROATIAN: return SERBIAN;
540
+     case PORTUGUESE_P:
541
+     case PORTUGUESE_B:   return PORTUGUESE;
542
+     default:             return lang;
543
+   }
544
+ }
545
+