language_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,243 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+ //
5
+ // Various Google-specific macros.
6
+ //
7
+ // This code is compiled directly on many platforms, including client
8
+ // platforms like Windows, Mac, and embedded systems. Before making
9
+ // any changes here, make sure that you're not breaking any platforms.
10
+ //
11
+
12
+ #ifndef BASE_MACROS_H_
13
+ #define BASE_MACROS_H_
14
+
15
+ #include <stddef.h> // For size_t
16
+
17
+ #include "base/type_traits.h"
18
+
19
+
20
+ // The COMPILE_ASSERT macro can be used to verify that a compile time
21
+ // expression is true. For example, you could use it to verify the
22
+ // size of a static array:
23
+ //
24
+ // COMPILE_ASSERT(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES,
25
+ // content_type_names_incorrect_size);
26
+ //
27
+ // or to make sure a struct is smaller than a certain size:
28
+ //
29
+ // COMPILE_ASSERT(sizeof(foo) < 128, foo_too_large);
30
+ //
31
+ // The second argument to the macro is the name of the variable. If
32
+ // the expression is false, most compilers will issue a warning/error
33
+ // containing the name of the variable.
34
+
35
+ #define COMPILE_ASSERT(expr, msg) \
36
+ typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
37
+
38
+ // Implementation details of COMPILE_ASSERT:
39
+ //
40
+ // - COMPILE_ASSERT works by defining an array type that has -1
41
+ // elements (and thus is invalid) when the expression is false.
42
+ //
43
+ // - The simpler definition
44
+ //
45
+ // #define COMPILE_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1]
46
+ //
47
+ // does not work, as gcc supports variable-length arrays whose sizes
48
+ // are determined at run-time (this is gcc's extension and not part
49
+ // of the C++ standard). As a result, gcc fails to reject the
50
+ // following code with the simple definition:
51
+ //
52
+ // int foo;
53
+ // COMPILE_ASSERT(foo, msg); // not supposed to compile as foo is
54
+ // // not a compile-time constant.
55
+ //
56
+ // - By using the type CompileAssert<(bool(expr))>, we ensure that
57
+ // expr is a compile-time constant. (Template arguments must be
58
+ // determined at compile-time.)
59
+ //
60
+ // - The outer parentheses in CompileAssert<(bool(expr))> are necessary
61
+ // to work around a bug in gcc 3.4.4 and 4.0.1. If we had written
62
+ //
63
+ // CompileAssert<bool(expr)>
64
+ //
65
+ // instead, these compilers would refuse to compile
66
+ //
67
+ // COMPILE_ASSERT(5 > 0, some_message);
68
+ //
69
+ // (They seem to think the ">" in "5 > 0" marks the end of the
70
+ // template argument list.)
71
+ //
72
+ // - The array size is (bool(expr) ? 1 : -1), instead of simply
73
+ //
74
+ // ((expr) ? 1 : -1).
75
+ //
76
+ // This is to avoid running into a bug in MS VC 7.1, which
77
+ // causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
78
+
79
+
80
+ // A macro to disallow the copy constructor and operator= functions
81
+ // This should be used in the private: declarations for a class
82
+ //
83
+ // For disallowing only assign or copy, write the code directly, but declare
84
+ // the intent in a comment, for example:
85
+ // void operator=(const TypeName&); // DISALLOW_ASSIGN
86
+ // Note, that most uses of DISALLOW_ASSIGN and DISALLOW_COPY are broken
87
+ // semantically, one should either disallow both or neither. Try to
88
+ // avoid these in new code.
89
+ #define DISALLOW_COPY_AND_ASSIGN(TypeName) \
90
+ TypeName(const TypeName&); \
91
+ void operator=(const TypeName&)
92
+
93
+ // An older, politically incorrect name for the above.
94
+ // Prefer DISALLOW_COPY_AND_ASSIGN for new code.
95
+ #define DISALLOW_EVIL_CONSTRUCTORS(TypeName) DISALLOW_COPY_AND_ASSIGN(TypeName)
96
+
97
+ // A macro to disallow all the implicit constructors, namely the
98
+ // default constructor, copy constructor and operator= functions.
99
+ //
100
+ // This should be used in the private: declarations for a class
101
+ // that wants to prevent anyone from instantiating it. This is
102
+ // especially useful for classes containing only static methods.
103
+ #define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
104
+ TypeName(); \
105
+ DISALLOW_COPY_AND_ASSIGN(TypeName)
106
+
107
+ // The arraysize(arr) macro returns the # of elements in an array arr.
108
+ // The expression is a compile-time constant, and therefore can be
109
+ // used in defining new arrays, for example. If you use arraysize on
110
+ // a pointer by mistake, you will get a compile-time error.
111
+ //
112
+ // One caveat is that arraysize() doesn't accept any array of an
113
+ // anonymous type or a type defined inside a function. In these rare
114
+ // cases, you have to use the unsafe ARRAYSIZE() macro below. This is
115
+ // due to a limitation in C++'s template system. The limitation might
116
+ // eventually be removed, but it hasn't happened yet.
117
+
118
+ // This template function declaration is used in defining arraysize.
119
+ // Note that the function doesn't need an implementation, as we only
120
+ // use its type.
121
+ template <typename T, size_t N>
122
+ char (&ArraySizeHelper(T (&array)[N]))[N];
123
+
124
+ // That gcc wants both of these prototypes seems mysterious. VC, for
125
+ // its part, can't decide which to use (another mystery). Matching of
126
+ // template overloads: the final frontier.
127
+ #ifndef COMPILER_MSVC
128
+ template <typename T, size_t N>
129
+ char (&ArraySizeHelper(const T (&array)[N]))[N];
130
+ #endif
131
+
132
+ #define arraysize(array) (sizeof(ArraySizeHelper(array)))
133
+
134
+ // ARRAYSIZE performs essentially the same calculation as arraysize,
135
+ // but can be used on anonymous types or types defined inside
136
+ // functions. It's less safe than arraysize as it accepts some
137
+ // (although not all) pointers. Therefore, you should use arraysize
138
+ // whenever possible.
139
+ //
140
+ // The expression ARRAYSIZE(a) is a compile-time constant of type
141
+ // size_t.
142
+ //
143
+ // ARRAYSIZE catches a few type errors. If you see a compiler error
144
+ //
145
+ // "warning: division by zero in ..."
146
+ //
147
+ // when using ARRAYSIZE, you are (wrongfully) giving it a pointer.
148
+ // You should only use ARRAYSIZE on statically allocated arrays.
149
+ //
150
+ // The following comments are on the implementation details, and can
151
+ // be ignored by the users.
152
+ //
153
+ // ARRAYSIZE(arr) works by inspecting sizeof(arr) (the # of bytes in
154
+ // the array) and sizeof(*(arr)) (the # of bytes in one array
155
+ // element). If the former is divisible by the latter, perhaps arr is
156
+ // indeed an array, in which case the division result is the # of
157
+ // elements in the array. Otherwise, arr cannot possibly be an array,
158
+ // and we generate a compiler error to prevent the code from
159
+ // compiling.
160
+ //
161
+ // Since the size of bool is implementation-defined, we need to cast
162
+ // !(sizeof(a) % sizeof(*(a))) to size_t in order to ensure the final
163
+ // result has type size_t.
164
+ //
165
+ // This macro is not perfect as it wrongfully accepts certain
166
+ // pointers, namely where the pointer size is divisible by the pointee
167
+ // size. Since all our code has to go through a 32-bit compiler,
168
+ // where a pointer is 4 bytes, this means all pointers to a type whose
169
+ // size is 3 or greater than 4 will be (righteously) rejected.
170
+ //
171
+ // Kudos to Jorg Brown for this simple and elegant implementation.
172
+ //
173
+ // - wan 2005-11-16
174
+ //
175
+ // Starting with Visual C++ 2005, WinNT.h includes ARRAYSIZE.
176
+ #if !defined(COMPILER_MSVC) || (defined(_MSC_VER) && _MSC_VER < 1400)
177
+ #define ARRAYSIZE(a) \
178
+ ((sizeof(a) / sizeof(*(a))) / \
179
+ static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
180
+ #endif
181
+
182
+ // A macro to turn a symbol into a string
183
+ #define AS_STRING(x) AS_STRING_INTERNAL(x)
184
+ #define AS_STRING_INTERNAL(x) #x
185
+
186
+
187
+ // One of the type traits, is_pod, makes it possible to query whether
188
+ // a type is a POD type. It is impossible for type_traits.h to get
189
+ // this right without compiler support, so it fails conservatively. It
190
+ // knows that fundamental types and pointers are PODs, but it can't
191
+ // tell whether user classes are PODs. The DECLARE_POD macro is used
192
+ // to inform the type traits library that a user class is a POD.
193
+ //
194
+ // Implementation note: the typedef at the end is just to make it legal
195
+ // to put a semicolon after DECLARE_POD(foo).
196
+ //
197
+ //
198
+ // So what's a POD? The C++ standard (clause 9 paragraph 4) gives a
199
+ // full definition, but a good rule of thumb is that a struct is a POD
200
+ // ("plain old data") if it doesn't use any of the features that make
201
+ // C++ different from C. A POD struct can't have constructors,
202
+ // destructors, assignment operators, base classes, private or
203
+ // protected members, or virtual functions, and all of its member
204
+ // variables must themselves be PODs.
205
+
206
+ #define DECLARE_POD(TypeName) \
207
+ namespace base { \
208
+ template<> struct is_pod<TypeName> : true_type { }; \
209
+ } \
210
+ typedef int Dummy_Type_For_DECLARE_POD
211
+
212
+ // We once needed a different technique to assert that a nested class
213
+ // is a POD. This is no longer necessary, and DECLARE_NESTED_POD is
214
+ // just a synonym for DECLARE_POD. We continue to provide
215
+ // DECLARE_NESTED_POD only so we don't have to change client
216
+ // code. Regardless of whether you use DECLARE_POD or
217
+ // DECLARE_NESTED_POD: use it after the outer class. Using it within a
218
+ // class definition will give a compiler error.
219
+ #define DECLARE_NESTED_POD(TypeName) DECLARE_POD(TypeName)
220
+
221
+ // Declare that TemplateName<T> is a POD whenever T is
222
+ #define PROPAGATE_POD_FROM_TEMPLATE_ARGUMENT(TemplateName) \
223
+ namespace base { \
224
+ template <typename T> struct is_pod<TemplateName<T> > : is_pod<T> { }; \
225
+ } \
226
+ typedef int Dummy_Type_For_PROPAGATE_POD_FROM_TEMPLATE_ARGUMENT
227
+
228
+ // Macro that does nothing if TypeName is a POD, and gives a compiler
229
+ // error if TypeName is a non-POD. You should put a descriptive
230
+ // comment right next to the macro call so that people can tell what
231
+ // the compiler error is about.
232
+ //
233
+ // Implementation note: this works by taking the size of a type that's
234
+ // complete when TypeName is a POD and incomplete otherwise.
235
+
236
+ template <typename Boolean> struct ERROR_TYPE_MUST_BE_POD;
237
+ template <> struct ERROR_TYPE_MUST_BE_POD<base::true_type> { };
238
+ #define ENFORCE_POD(TypeName) \
239
+ enum { dummy_##TypeName \
240
+ = sizeof(ERROR_TYPE_MUST_BE_POD< \
241
+ typename base::is_pod<TypeName>::type>) }
242
+
243
+ #endif // BASE_MACROS_H_
@@ -0,0 +1,54 @@
1
+ // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef BASE_PORT_H_
6
+ #define BASE_PORT_H_
7
+
8
+ #include <stdarg.h>
9
+ #include "base/build_config.h"
10
+
11
+ #ifdef COMPILER_MSVC
12
+ #define GG_LONGLONG(x) x##I64
13
+ #define GG_ULONGLONG(x) x##UI64
14
+ #else
15
+ #define GG_LONGLONG(x) x##LL
16
+ #define GG_ULONGLONG(x) x##ULL
17
+ #endif
18
+
19
+ // Per C99 7.18.4, define __STDC_CONSTANT_MACROS before including <stdint.h>
20
+ // to get the INTn_C and UINTn_C macros for integer constants. It's difficult
21
+ // to guarantee any specific ordering of header includes, so it's difficult to
22
+ // guarantee that the INTn_C macros can be defined by including <stdint.h> at
23
+ // any specific point. Provide GG_INTn_C macros instead.
24
+
25
+ #define GG_INT8_C(x) (x)
26
+ #define GG_INT16_C(x) (x)
27
+ #define GG_INT32_C(x) (x)
28
+ #define GG_INT64_C(x) GG_LONGLONG(x)
29
+
30
+ #define GG_UINT8_C(x) (x ## U)
31
+ #define GG_UINT16_C(x) (x ## U)
32
+ #define GG_UINT32_C(x) (x ## U)
33
+ #define GG_UINT64_C(x) GG_ULONGLONG(x)
34
+
35
+ // It's possible for functions that use a va_list, such as StringPrintf, to
36
+ // invalidate the data in it upon use. The fix is to make a copy of the
37
+ // structure before using it and use that copy instead. va_copy is provided
38
+ // for this purpose. MSVC does not provide va_copy, so define an
39
+ // implementation here. It is not guaranteed that assignment is a copy, so the
40
+ // StringUtil.VariableArgsFunc unit test tests this capability.
41
+ #if defined(COMPILER_GCC)
42
+ #define GG_VA_COPY(a, b) (va_copy(a, b))
43
+ #elif defined(COMPILER_MSVC)
44
+ #define GG_VA_COPY(a, b) (a = b)
45
+ #endif
46
+
47
+ // Define an OS-neutral wrapper for shared library entry points
48
+ #if defined(OS_WIN)
49
+ #define API_CALL __stdcall
50
+ #else
51
+ #define API_CALL
52
+ #endif
53
+
54
+ #endif // BASE_PORT_H_