language_detection 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,301 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_PUBLIC_ENCODINGS_H_
6
+ #define ENCODINGS_PUBLIC_ENCODINGS_H_
7
+
8
+ // This interface defines the Encoding enum and various functions that
9
+ // depend only on Encoding values.
10
+
11
+ // A hash-function for Encoding, hash<Encoding>, is defined in
12
+ // i18n/encodings/public/encodings-hash.h
13
+
14
+ // On some Windows projects, UNICODE may be defined, which would prevent the
15
+ // Encoding enum below from compiling. Note that this is a quick fix that does
16
+ // not break any existing projects. The UNICODE enum may someday be changed
17
+ // to something more specific and non-colliding, but this involves careful
18
+ // testing of changes in many other projects.
19
+ #undef UNICODE
20
+
21
+ // NOTE: The Encoding enum must always start at 0. This assumption has
22
+ // been made and used.
23
+
24
+ #ifndef SWIG
25
+
26
+ #include "encodings/proto/encodings.pb.h"
27
+
28
+ // We must have this for compatibility.
29
+ // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
30
+ //using namespace i18n::encodings;
31
+
32
+ #else
33
+
34
+ // Special proto SWIG workaround header file.
35
+ #include "i18n/encodings/internal/encodings_proto_wrapper.h"
36
+
37
+ #endif
38
+
39
+ const int kNumEncodings = NUM_ENCODINGS;
40
+
41
+ // some of the popular encoding aliases
42
+ // TODO(jrm) Make these static const Encoding values instead of macros.
43
+ #define LATIN1 ISO_8859_1
44
+ #define LATIN2 ISO_8859_2
45
+ #define LATIN3 ISO_8859_3
46
+ #define LATIN4 ISO_8859_4
47
+ #define CYRILLIC ISO_8859_5
48
+ #define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language
49
+ #define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language
50
+ #define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language
51
+ #define LATIN5 ISO_8859_9
52
+ #define LATIN6 ISO_8859_10
53
+ #define KOREAN_HANGUL KOREAN_EUC_KR
54
+
55
+ // The default Encoding (LATIN1).
56
+ Encoding default_encoding();
57
+
58
+
59
+
60
+ // *************************************************************
61
+ // Encoding predicates
62
+ // IsValidEncoding()
63
+ // IsEncEncCompatible
64
+ // IsSupersetOfAscii7Bit
65
+ // Is8BitEncoding
66
+ // IsCJKEncoding
67
+ // IsHebrewEncoding
68
+ // IsRightToLeftEncoding
69
+ // IsLogicalRightToLeftEncoding
70
+ // IsVisualRightToLeftEncoding
71
+ // IsIso2022Encoding
72
+ // IsIso2022JpOrVariant
73
+ // IsShiftJisOrVariant
74
+ // IsJapaneseCellPhoneCarrierSpecificEncoding
75
+ // *************************************************************
76
+
77
+ // IsValidEncoding
78
+ // ===================================
79
+ //
80
+ // Function to check if the input encoding enum is within range.
81
+ //
82
+
83
+ bool IsValidEncoding(Encoding enc);
84
+
85
+ //
86
+ // IsEncEncCompatible
87
+ // ------------------
88
+ //
89
+ // This function is to determine whether or not converting from the
90
+ // first encoding to the second requires any changes to the underlying
91
+ // text (e.g. ASCII_7BIT is a subset of UTF8).
92
+ //
93
+ // TODO(someone more familiar with i18n): the current implementation
94
+ // is likely incomplete. It would be good to consider the full matrix
95
+ // of all pairs of encodings and to fish out all compatible pairs.
96
+ //
97
+ bool IsEncEncCompatible(const Encoding from, const Encoding to);
98
+
99
+ // To be a superset of 7-bit Ascii means that bytes 0...127 in the given
100
+ // encoding represent the same characters as they do in ISO_8859_1.
101
+
102
+ // WARNING: This function does not currently return true for all encodings that
103
+ // are supersets of Ascii 7-bit.
104
+ bool IsSupersetOfAscii7Bit(Encoding e);
105
+
106
+ // To be an 8-bit encoding means that there are fewer than 256 symbols.
107
+ // Each byte determines a new character; there are no multi-byte sequences.
108
+
109
+ // WARNING: This function does not currently return true for all encodings that
110
+ // are 8-bit encodings.
111
+ bool Is8BitEncoding(Encoding e);
112
+
113
+ // IsCJKEncoding
114
+ // -------------
115
+ //
116
+ // This function returns true if the encoding is either Chinese
117
+ // (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
118
+ // considered a CJK encoding.
119
+ bool IsCJKEncoding(Encoding e);
120
+
121
+ // IsHebrewEncoding
122
+ // -------------
123
+ //
124
+ // This function returns true if the encoding is a Hebrew specific
125
+ // encoding (not UTF8, etc).
126
+ bool IsHebrewEncoding(Encoding e);
127
+
128
+ // IsRightToLeftEncoding
129
+ // ---------------------
130
+ //
131
+ // Returns true if the encoding is a right-to-left encoding.
132
+ //
133
+ // Note that the name of this function is somewhat misleading. There is nothing
134
+ // "right to left" about these encodings. They merely contain code points for
135
+ // characters in RTL languages such as Hebrew and Arabic. But this is also
136
+ // true for UTF-8.
137
+ //
138
+ // TODO(benjy): Get rid of this function. The only special case we
139
+ // should need to worry about is visual encodings. Anything we
140
+ // need to do for all 'RTL' encodings we need to do for UTF-8 as well.
141
+ bool IsRightToLeftEncoding(Encoding enc);
142
+
143
+ // IsLogicalRightToLeftEncoding
144
+ // ----------------------------
145
+ //
146
+ // Returns true if the encoding is a logical right-to-left encoding.
147
+ // Logical right-to-left encodings are those that the browser renders
148
+ // right-to-left and applies the BiDi algorithm to. Therefore the characters
149
+ // appear in reading order in the file, and indexing, snippet generation etc.
150
+ // should all just work with no special processing.
151
+ //
152
+ // TODO(benjy): Get rid of this function. The only special case we
153
+ // should need to worry about is visual encodings.
154
+ bool IsLogicalRightToLeftEncoding(Encoding enc);
155
+
156
+ // IsVisualRightToLeftEncoding
157
+ // ---------------------------
158
+ //
159
+ // Returns true if the encoding is a visual right-to-left encoding.
160
+ // Visual right-to-left encodings are those that the browser renders
161
+ // left-to-right and does not apply the BiDi algorithm to. Therefore each
162
+ // line appears in reverse order in the file, lines are manually wrapped
163
+ // by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
164
+ // the prehistoric days when browsers couldn't render right-to-left, but
165
+ // unfortunately some visual pages persist to this day. These documents require
166
+ // special processing so that we don't index or snippet them with each line
167
+ // reversed.
168
+ bool IsVisualRightToLeftEncoding(Encoding enc);
169
+
170
+ // IsIso2022Encoding
171
+ // -----------------
172
+ //
173
+ // Returns true if the encoding is a kind of ISO 2022 such as
174
+ // ISO-2022-JP.
175
+ bool IsIso2022Encoding(Encoding enc);
176
+
177
+ // IsIso2022JpOrVariant
178
+ // --------------------
179
+ //
180
+ // Returns true if the encoding is ISO-2022-JP or a variant such as
181
+ // KDDI's ISO-2022-JP.
182
+ bool IsIso2022JpOrVariant(Encoding enc);
183
+
184
+ // IsShiftJisOrVariant
185
+ // --------------------
186
+ //
187
+ // Returns true if the encoding is Shift_JIS or a variant such as
188
+ // KDDI's Shift_JIS.
189
+ bool IsShiftJisOrVariant(Encoding enc);
190
+
191
+ // IsJapaneseCellPhoneCarrierSpecificEncoding
192
+ // -----------------------------------------
193
+ //
194
+ // Returns true if it's a Japanese cell-phone-carrier-specific encoding
195
+ // such as KDDI_SHIFT_JIS.
196
+ bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
197
+
198
+
199
+
200
+ // *************************************************************
201
+ // ENCODING NAMES
202
+ //
203
+ // This interface defines a standard name for each valid encoding, and
204
+ // a standard name for invalid encodings. (Some names use all upper
205
+ // case, but others use mixed case.)
206
+ //
207
+ // EncodingName() [Encoding to name]
208
+ // MimeEncodingName() [Encoding to name]
209
+ // EncodingFromName() [name to Encoding]
210
+ // EncodingNameAliasToEncoding() [name to Encoding]
211
+ // default_encoding_name()
212
+ // invalid_encoding_name()
213
+ // *************************************************************
214
+
215
+ // EncodingName
216
+ // ------------
217
+ //
218
+ // Given the encoding, returns its standard name.
219
+ // Return invalid_encoding_name() if the encoding is invalid.
220
+ //
221
+ const char* EncodingName(Encoding enc);
222
+
223
+ //
224
+ // MimeEncodingName
225
+ // ----------------
226
+ //
227
+ // Return the "preferred MIME name" of an encoding.
228
+ //
229
+ // This name is suitable for using in HTTP headers, HTML tags,
230
+ // and as the "charset" parameter of a MIME Content-Type.
231
+ const char* MimeEncodingName(Encoding enc);
232
+
233
+
234
+ // The maximum length of an encoding name
235
+ const int kMaxEncodingNameSize = 50;
236
+
237
+ // The standard name of the default encoding.
238
+ const char* default_encoding_name();
239
+
240
+ // The name used for an invalid encoding.
241
+ const char* invalid_encoding_name();
242
+
243
+ // EncodingFromName
244
+ // ----------------
245
+ //
246
+ // If enc_name matches the standard name of an Encoding, using a
247
+ // case-insensitive comparison, set *encoding to that Encoding and
248
+ // return true. Otherwise set *encoding to UNKNOWN_ENCODING and
249
+ // return false.
250
+ //
251
+ // REQUIRES: encoding must not be NULL.
252
+ //
253
+ bool EncodingFromName(const char* enc_name, Encoding *encoding);
254
+
255
+ //
256
+ // EncodingNameAliasToEncoding
257
+ // ---------------------------
258
+ //
259
+ // If enc_name matches the standard name or an alias of an Encoding,
260
+ // using a case-insensitive comparison, return that
261
+ // Encoding. Otherwise, return UNKNOWN_ENCODING.
262
+ //
263
+ // Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
264
+ // GREEK), alternate names (e.g., "cyrillic" for ISO_8859_5) and
265
+ // common variations with hyphens and underscores (e.g., "koi8-u" and
266
+ // "koi8u" for RUSSIAN_KOI8_R).
267
+
268
+ Encoding EncodingNameAliasToEncoding(const char *enc_name);
269
+
270
+
271
+ // *************************************************************
272
+ // Miscellany
273
+ // *************************************************************
274
+
275
+ // PreferredWebOutputEncoding
276
+ // --------------------------
277
+ //
278
+ // Some multi-byte encodings use byte values that coincide with the
279
+ // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
280
+ // can misinterpret these, as indicated in an external XSS report from
281
+ // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
282
+ // also use UTF8 instead of encodings that we don't support in our
283
+ // output, and we generally try to be conservative in what we send out.
284
+ // Where the client asks for single- or double-byte encodings that are
285
+ // not as common, we substitute a more common single- or double-byte
286
+ // encoding, if there is one, thereby preserving the client's intent
287
+ // to use less space than UTF-8. This also means that characters
288
+ // outside the destination set will be converted to HTML NCRs (&#NNN;)
289
+ // if requested.
290
+ Encoding PreferredWebOutputEncoding(Encoding enc);
291
+
292
+
293
+ // InitEncodings
294
+ // -------------
295
+ //
296
+ // Ensures the encodings module has been initialized. Normally this happens
297
+ // during InitGoogle, but this allows access for scripts that don't
298
+ // support InitGoogle.
299
+ void InitEncodings();
300
+
301
+ #endif // ENCODINGS_PUBLIC_ENCODINGS_H_
@@ -0,0 +1 @@
1
+ # TODO: Generate Makefile
@@ -0,0 +1,88 @@
1
+ #include <stdio.h>
2
+ #include <string.h>
3
+ #include "encodings/compact_lang_det/compact_lang_det.h"
4
+ #include "encodings/compact_lang_det/ext_lang_enc.h"
5
+ #include "encodings/proto/encodings.pb.h"
6
+
7
+ typedef struct {  // One candidate language entry, filled in by language_detection().
8
+ const char *name;  // Set from LanguageName() for this candidate.
9
+ const char *code;  // Set from ExtLanguageCode() for this candidate.
10
+ int percent;  // Copied from the matching percent3[] slot written by the detector.
11
+ double score;  // Copied from the matching normalized_score3[] slot written by the detector.
12
+ } LanguageDetail;
13
+
14
+ typedef struct {  // Overall result returned by value from language_detection().
15
+ const char *name;  // LanguageName() of the language returned by the detector.
16
+ const char *code;  // ExtLanguageCode() of that same language.
17
+ bool reliable;  // The detector's is_reliable output.
18
+ int text_bytes;  // The detector's text_bytes output.
19
+ LanguageDetail *details;  // Array of 3 entries allocated with new[] in language_detection().
20
+ } DetectedLanguage;
21
+
22
+ extern "C" {
23
+ DetectedLanguage language_detection(const char * src, bool is_plain_text) {
24
+ bool do_allow_extended_languages = true;
25
+ bool do_pick_summary_language = false;
26
+ bool do_remove_weak_matches = false;
27
+
28
+ bool is_reliable;
29
+
30
+ // "id" boosts Indonesian
31
+ //
32
+ const char* tld_hint = NULL;
33
+
34
+ // SJS boosts Japanese
35
+ //
36
+ int encoding_hint = UNKNOWN_ENCODING;
37
+
38
+ // ITALIAN boosts it
39
+ //
40
+ Language language_hint = UNKNOWN_LANGUAGE;
41
+
42
+ double normalized_score3[3];
43
+ Language language3[3];
44
+ int percent3[3];
45
+ int text_bytes;
46
+
47
+ Language lang;
48
+ lang = CompactLangDet::DetectLanguage(0,
49
+ src, strlen(src),
50
+ is_plain_text,
51
+ do_allow_extended_languages,
52
+ do_pick_summary_language,
53
+ do_remove_weak_matches,
54
+ tld_hint,
55
+ encoding_hint,
56
+ language_hint,
57
+ language3,
58
+ percent3,
59
+ normalized_score3,
60
+ &text_bytes,
61
+ &is_reliable);
62
+
63
+
64
+ DetectedLanguage detected_language;
65
+ LanguageDetail * details = new LanguageDetail [3];
66
+
67
+ detected_language.name = LanguageName(lang);
68
+ detected_language.code = ExtLanguageCode(lang);
69
+ detected_language.reliable = is_reliable;
70
+ detected_language.text_bytes = text_bytes;
71
+
72
+ for(int i = 0; i < 3; i++) {
73
+ Language lang = language3[i];
74
+ LanguageDetail detail;
75
+
76
+ detail.name = LanguageName(lang);
77
+ detail.code = ExtLanguageCode(lang);
78
+ detail.percent = percent3[i];
79
+ detail.score = normalized_score3[i];
80
+
81
+ details[i] = detail;
82
+ }
83
+
84
+ detected_language.details = details;
85
+
86
+ return detected_language;
87
+ }
88
+ }