language_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,301 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_PUBLIC_ENCODINGS_H_
6
+ #define ENCODINGS_PUBLIC_ENCODINGS_H_
7
+
8
+ // This interface defines the Encoding enum and various functions that
9
+ // depend only on Encoding values.
10
+
11
+ // A hash-function for Encoding, hash<Encoding>, is defined in
12
+ // i18n/encodings/public/encodings-hash.h
13
+
14
+ // On some Windows projects, UNICODE may be defined, which would prevent the
15
+ // Encoding enum below from compiling. Note that this is a quick fix that does
16
+ // not break any existing projects. The UNICODE enum may someday be changed
17
+ // to something more specific and non-colliding, but this involves careful
18
+ // testing of changes in many other projects.
19
+ #undef UNICODE
20
+
21
+ // NOTE: The Encoding enum must always start at 0. This assumption has
22
+ // been made and used.
23
+
24
+ #ifndef SWIG
25
+
26
+ #include "encodings/proto/encodings.pb.h"
27
+
28
+ // We must have this for compatibility.
29
+ // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
30
+ //using namespace i18n::encodings;
31
+
32
+ #else
33
+
34
+ // Special proto SWIG workaround header file.
35
+ #include "i18n/encodings/internal/encodings_proto_wrapper.h"
36
+
37
+ #endif
38
+
39
+ const int kNumEncodings = NUM_ENCODINGS;
40
+
41
+ // some of the popular encoding aliases
42
+ // TODO(jrm) Make these static const Encoding values instead of macros.
43
+ #define LATIN1 ISO_8859_1
44
+ #define LATIN2 ISO_8859_2
45
+ #define LATIN3 ISO_8859_3
46
+ #define LATIN4 ISO_8859_4
47
+ #define CYRILLIC ISO_8859_5
48
+ #define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language
49
+ #define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language
50
+ #define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language
51
+ #define LATIN5 ISO_8859_9
52
+ #define LATIN6 ISO_8859_10
53
+ #define KOREAN_HANGUL KOREAN_EUC_KR
54
+
55
+ // The default Encoding (LATIN1).
56
+ Encoding default_encoding();
57
+
58
+
59
+
60
+ // *************************************************************
61
+ // Encoding predicates
62
+ // IsValidEncoding()
63
+ // IsEncEncCompatible
64
+ // IsSupersetOfAscii7Bit
65
+ // Is8BitEncoding
66
+ // IsCJKEncoding
67
+ // IsHebrewEncoding
68
+ // IsRightToLeftEncoding
69
+ // IsLogicalRightToLeftEncoding
70
+ // IsVisualRightToLeftEncoding
71
+ // IsIso2022Encoding
72
+ // IsIso2022JpOrVariant
73
+ // IsShiftJisOrVariant
74
+ // IsJapaneseCellPhoneCarrierSpecificEncoding
75
+ // *************************************************************
76
+
77
+ // IsValidEncoding
78
+ // ===================================
79
+ //
80
+ // Function to check if the input encoding enum is within range.
81
+ //
82
+
83
+ bool IsValidEncoding(Encoding enc);
84
+
85
+ //
86
+ // IsEncEncCompatible
87
+ // ------------------
88
+ //
89
+ // This function is to determine whether or not converting from the
90
+ // first encoding to the second requires any changes to the underlying
91
+ // text (e.g. ASCII_7BIT is a subset of UTF8).
92
+ //
93
+ // TODO(someone more familiar with i18n): the current implementation
94
+ // is likely incomplete. It would be good to consider the full matrix
95
+ // of all pairs of encodings and to fish out all compatible pairs.
96
+ //
97
+ bool IsEncEncCompatible(const Encoding from, const Encoding to);
98
+
99
+ // To be a superset of 7-bit Ascii means that bytes 0...127 in the given
100
+ // encoding represent the same characters as they do in ISO_8859_1.
101
+
102
+ // WARNING: This function does not currently return true for all encodings that
103
+ // are supersets of Ascii 7-bit.
104
+ bool IsSupersetOfAscii7Bit(Encoding e);
105
+
106
+ // To be an 8-bit encoding means that there are fewer than 256 symbols.
107
+ // Each byte determines a new character; there are no multi-byte sequences.
108
+
109
+ // WARNING: This function does not currently return true for all encodings that
110
+ // are 8-bit encodings.
111
+ bool Is8BitEncoding(Encoding e);
112
+
113
+ // IsCJKEncoding
114
+ // -------------
115
+ //
116
+ // This function returns true if the encoding is either Chinese
117
+ // (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
118
+ // considered a CJK encoding.
119
+ bool IsCJKEncoding(Encoding e);
120
+
121
+ // IsHebrewEncoding
122
+ // -------------
123
+ //
124
+ // This function returns true if the encoding is a Hebrew specific
125
+ // encoding (not UTF8, etc).
126
+ bool IsHebrewEncoding(Encoding e);
127
+
128
+ // IsRightToLeftEncoding
129
+ // ---------------------
130
+ //
131
+ // Returns true if the encoding is a right-to-left encoding.
132
+ //
133
+ // Note that the name of this function is somewhat misleading. There is nothing
134
+ // "right to left" about these encodings. They merely contain code points for
135
+ // characters in RTL languages such as Hebrew and Arabic. But this is also
136
+ // true for UTF-8.
137
+ //
138
+ // TODO(benjy): Get rid of this function. The only special-case we
139
+ // should need to worry about are visual encodings. Anything we
140
+ // need to do for all 'RTL' encodings we need to do for UTF-8 as well.
141
+ bool IsRightToLeftEncoding(Encoding enc);
142
+
143
+ // IsLogicalRightToLeftEncoding
144
+ // ----------------------------
145
+ //
146
+ // Returns true if the encoding is a logical right-to-left encoding.
147
+ // Logical right-to-left encodings are those that the browser renders
148
+ // right-to-left and applies the BiDi algorithm to. Therefore the characters
149
+ // appear in reading order in the file, and indexing, snippet generation etc.
150
+ // should all just work with no special processing.
151
+ //
152
+ // TODO(benjy): Get rid of this function. The only special-case we
153
+ // should need to worry about are visual encodings.
154
+ bool IsLogicalRightToLeftEncoding(Encoding enc);
155
+
156
+ // IsVisualRightToLeftEncoding
157
+ // ---------------------------
158
+ //
159
+ // Returns true if the encoding is a visual right-to-left encoding.
160
+ // Visual right-to-left encodings are those that the browser renders
161
+ // left-to-right and does not apply the BiDi algorithm to. Therefore each
162
+ // line appears in reverse order in the file, lines are manually wrapped
163
+ // by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
164
+ // the prehistoric days when browsers couldn't render right-to-left, but
165
+ // unfortunately some visual pages persist to this day. These documents require
166
+ // special processing so that we don't index or snippet them with each line
167
+ // reversed.
168
+ bool IsVisualRightToLeftEncoding(Encoding enc);
169
+
170
+ // IsIso2022Encoding
171
+ // -----------------
172
+ //
173
+ // Returns true if the encoding is a kind of ISO 2022 such as
174
+ // ISO-2022-JP.
175
+ bool IsIso2022Encoding(Encoding enc);
176
+
177
+ // IsIso2022JpOrVariant
178
+ // --------------------
179
+ //
180
+ // Returns true if the encoding is ISO-2022-JP or a variant such as
181
+ // KDDI's ISO-2022-JP.
182
+ bool IsIso2022JpOrVariant(Encoding enc);
183
+
184
+ // IsShiftJisOrVariant
185
+ // --------------------
186
+ //
187
+ // Returns true if the encoding is Shift_JIS or a variant such as
188
+ // KDDI's Shift_JIS.
189
+ bool IsShiftJisOrVariant(Encoding enc);
190
+
191
+ // IsJapaneseCellPhoneCarrierSpecificEncoding
192
+ // -----------------------------------------
193
+ //
194
+ // Returns true if it's Japanese cell phone carrier specific encoding
195
+ // such as KDDI_SHIFT_JIS.
196
+ bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
197
+
198
+
199
+
200
+ // *************************************************************
201
+ // ENCODING NAMES
202
+ //
203
+ // This interface defines a standard name for each valid encoding, and
204
+ // a standard name for invalid encodings. (Some names use all upper
205
+ // case, but others use mixed case.)
206
+ //
207
+ // EncodingName() [Encoding to name]
208
+ // MimeEncodingName() [Encoding to name]
209
+ // EncodingFromName() [name to Encoding]
210
+ // EncodingNameAliasToEncoding() [name to Encoding]
211
+ // default_encoding_name()
212
+ // invalid_encoding_name()
213
+ // *************************************************************
214
+
215
+ // EncodingName
216
+ // ------------
217
+ //
218
+ // Given the encoding, returns its standard name.
219
+ // Return invalid_encoding_name() if the encoding is invalid.
220
+ //
221
+ const char* EncodingName(Encoding enc);
222
+
223
+ //
224
+ // MimeEncodingName
225
+ // ----------------
226
+ //
227
+ // Return the "preferred MIME name" of an encoding.
228
+ //
229
+ // This name is suitable for using in HTTP headers, HTML tags,
230
+ // and as the "charset" parameter of a MIME Content-Type.
231
+ const char* MimeEncodingName(Encoding enc);
232
+
233
+
234
+ // The maximum length of an encoding name
235
+ const int kMaxEncodingNameSize = 50;
236
+
237
+ // The standard name of the default encoding.
238
+ const char* default_encoding_name();
239
+
240
+ // The name used for an invalid encoding.
241
+ const char* invalid_encoding_name();
242
+
243
+ // EncodingFromName
244
+ // ----------------
245
+ //
246
+ // If enc_name matches the standard name of an Encoding, using a
247
+ // case-insensitive comparison, set *encoding to that Encoding and
248
+ // return true. Otherwise set *encoding to UNKNOWN_ENCODING and
249
+ // return false.
250
+ //
251
+ // REQUIRES: encoding must not be NULL.
252
+ //
253
+ bool EncodingFromName(const char* enc_name, Encoding *encoding);
254
+
255
+ //
256
+ // EncodingNameAliasToEncoding
257
+ // ---------------------------
258
+ //
259
+ // If enc_name matches the standard name or an alias of an Encoding,
260
+ // using a case-insensitive comparison, return that
261
+ // Encoding. Otherwise, return UNKNOWN_ENCODING.
262
+ //
263
+ // Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
264
+ // GREEK), alternate names (e.g., "cyrillic" for ISO_8859_5) and
265
+ // common variations with hyphens and underscores (e.g., "koi8-u" and
266
+ // "koi8u" for RUSSIAN_KOI8_R).
267
+
268
+ Encoding EncodingNameAliasToEncoding(const char *enc_name);
269
+
270
+
271
+ // *************************************************************
272
+ // Miscellany
273
+ // *************************************************************
274
+
275
+ // PreferredWebOutputEncoding
276
+ // --------------------------
277
+ //
278
+ // Some multi-byte encodings use byte values that coincide with the
279
+ // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
280
+ // can misinterpret these, as indicated in an external XSS report from
281
+ // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
282
+ // also use UTF8 instead of encodings that we don't support in our
283
+ // output, and we generally try to be conservative in what we send out.
284
+ // Where the client asks for single- or double-byte encodings that are
285
+ // not as common, we substitute a more common single- or double-byte
286
+ // encoding, if there is one, thereby preserving the client's intent
287
+ // to use less space than UTF-8. This also means that characters
288
+ // outside the destination set will be converted to HTML NCRs (&#NNN;)
289
+ // if requested.
290
+ Encoding PreferredWebOutputEncoding(Encoding enc);
291
+
292
+
293
+ // InitEncodings
294
+ // -------------
295
+ //
296
+ // Ensures the encodings module has been initialized. Normally this happens
297
+ // during InitGoogle, but this allows access for scripts that don't
298
+ // support InitGoogle.
299
+ void InitEncodings();
300
+
301
+ #endif // ENCODINGS_PUBLIC_ENCODINGS_H_
@@ -0,0 +1 @@
1
+ # TODO: Generate Makefile
@@ -0,0 +1,88 @@
1
+ #include <stdio.h>
2
+ #include <string.h>
3
+ #include "encodings/compact_lang_det/compact_lang_det.h"
4
+ #include "encodings/compact_lang_det/ext_lang_enc.h"
5
+ #include "encodings/proto/encodings.pb.h"
6
+
7
// LanguageDetail
// --------------
//
// One candidate language reported alongside the main detection result.
// language_detection() fills an array of these from the detector's
// per-candidate outputs. Field order and types are part of the C-visible
// layout and must not change.
struct LanguageDetail {
  // Language name, as returned by LanguageName().
  const char *name;
  // Language code, as returned by ExtLanguageCode().
  const char *code;
  // Per-candidate value copied from the detector's percent3 output.
  int percent;
  // Per-candidate value copied from the detector's normalized_score3 output.
  double score;
};
13
+
14
+ typedef struct {
15
+ const char *name;
16
+ const char *code;
17
+ bool reliable;
18
+ int text_bytes;
19
+ LanguageDetail *details;
20
+ } DetectedLanguage;
21
+
22
+ extern "C" {
23
+ DetectedLanguage language_detection(const char * src, bool is_plain_text) {
24
+ bool do_allow_extended_languages = true;
25
+ bool do_pick_summary_language = false;
26
+ bool do_remove_weak_matches = false;
27
+
28
+ bool is_reliable;
29
+
30
+ // "id" boosts Indonesian
31
+ //
32
+ const char* tld_hint = NULL;
33
+
34
+ // SJS boosts Japanese
35
+ //
36
+ int encoding_hint = UNKNOWN_ENCODING;
37
+
38
+ // ITALIAN boosts it
39
+ //
40
+ Language language_hint = UNKNOWN_LANGUAGE;
41
+
42
+ double normalized_score3[3];
43
+ Language language3[3];
44
+ int percent3[3];
45
+ int text_bytes;
46
+
47
+ Language lang;
48
+ lang = CompactLangDet::DetectLanguage(0,
49
+ src, strlen(src),
50
+ is_plain_text,
51
+ do_allow_extended_languages,
52
+ do_pick_summary_language,
53
+ do_remove_weak_matches,
54
+ tld_hint,
55
+ encoding_hint,
56
+ language_hint,
57
+ language3,
58
+ percent3,
59
+ normalized_score3,
60
+ &text_bytes,
61
+ &is_reliable);
62
+
63
+
64
+ DetectedLanguage detected_language;
65
+ LanguageDetail * details = new LanguageDetail [3];
66
+
67
+ detected_language.name = LanguageName(lang);
68
+ detected_language.code = ExtLanguageCode(lang);
69
+ detected_language.reliable = is_reliable;
70
+ detected_language.text_bytes = text_bytes;
71
+
72
+ for(int i = 0; i < 3; i++) {
73
+ Language lang = language3[i];
74
+ LanguageDetail detail;
75
+
76
+ detail.name = LanguageName(lang);
77
+ detail.code = ExtLanguageCode(lang);
78
+ detail.percent = percent3[i];
79
+ detail.score = normalized_score3[i];
80
+
81
+ details[i] = detail;
82
+ }
83
+
84
+ detected_language.details = details;
85
+
86
+ return detected_language;
87
+ }
88
+ }