language_detection 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,243 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+ //
5
+ // Various Google-specific macros.
6
+ //
7
+ // This code is compiled directly on many platforms, including client
8
+ // platforms like Windows, Mac, and embedded systems. Before making
9
+ // any changes here, make sure that you're not breaking any platforms.
10
+ //
11
+
12
+ #ifndef BASE_MACROS_H_
13
+ #define BASE_MACROS_H_
14
+
15
+ #include <stddef.h> // For size_t
16
+
17
+ #include "base/type_traits.h"
18
+
19
+
20
+ // The COMPILE_ASSERT macro can be used to verify that a compile time
21
+ // expression is true. For example, you could use it to verify the
22
+ // size of a static array:
23
+ //
24
+ // COMPILE_ASSERT(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES,
25
+ // content_type_names_incorrect_size);
26
+ //
27
+ // or to make sure a struct is smaller than a certain size:
28
+ //
29
+ // COMPILE_ASSERT(sizeof(foo) < 128, foo_too_large);
30
+ //
31
+ // The second argument to the macro is the name of the variable. If
32
+ // the expression is false, most compilers will issue a warning/error
33
+ // containing the name of the variable.
34
+
35
+ #define COMPILE_ASSERT(expr, msg) \
36
+ typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
37
+
38
+ // Implementation details of COMPILE_ASSERT:
39
+ //
40
+ // - COMPILE_ASSERT works by defining an array type that has -1
41
+ // elements (and thus is invalid) when the expression is false.
42
+ //
43
+ // - The simpler definition
44
+ //
45
+ // #define COMPILE_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1]
46
+ //
47
+ // does not work, as gcc supports variable-length arrays whose sizes
48
+ // are determined at run-time (this is gcc's extension and not part
49
+ // of the C++ standard). As a result, gcc fails to reject the
50
+ // following code with the simple definition:
51
+ //
52
+ // int foo;
53
+ // COMPILE_ASSERT(foo, msg); // not supposed to compile as foo is
54
+ // // not a compile-time constant.
55
+ //
56
+ // - By using the type CompileAssert<(bool(expr))>, we ensure that
57
+ // expr is a compile-time constant. (Template arguments must be
58
+ // determined at compile-time.)
59
+ //
60
+ // - The outer parentheses in CompileAssert<(bool(expr))> are necessary
61
+ // to work around a bug in gcc 3.4.4 and 4.0.1. If we had written
62
+ //
63
+ // CompileAssert<bool(expr)>
64
+ //
65
+ // instead, these compilers would refuse to compile
66
+ //
67
+ // COMPILE_ASSERT(5 > 0, some_message);
68
+ //
69
+ // (They seem to think the ">" in "5 > 0" marks the end of the
70
+ // template argument list.)
71
+ //
72
+ // - The array size is (bool(expr) ? 1 : -1), instead of simply
73
+ //
74
+ // ((expr) ? 1 : -1).
75
+ //
76
+ // This is to avoid running into a bug in MS VC 7.1, which
77
+ // causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
78
+
79
+
80
+ // A macro to disallow the copy constructor and operator= functions
81
+ // This should be used in the private: declarations for a class
82
+ //
83
+ // For disallowing only assign or copy, write the code directly, but declare
84
+ // the intent in a comment, for example:
85
+ // void operator=(const TypeName&); // DISALLOW_ASSIGN
86
+ // Note, that most uses of DISALLOW_ASSIGN and DISALLOW_COPY are broken
87
+ // semantically; one should either disallow both or neither. Try to
88
+ // avoid these in new code.
89
+ #define DISALLOW_COPY_AND_ASSIGN(TypeName) \
90
+ TypeName(const TypeName&); \
91
+ void operator=(const TypeName&)
92
+
93
+ // An older, politically incorrect name for the above.
94
+ // Prefer DISALLOW_COPY_AND_ASSIGN for new code.
95
+ #define DISALLOW_EVIL_CONSTRUCTORS(TypeName) DISALLOW_COPY_AND_ASSIGN(TypeName)
96
+
97
+ // A macro to disallow all the implicit constructors, namely the
98
+ // default constructor, copy constructor and operator= functions.
99
+ //
100
+ // This should be used in the private: declarations for a class
101
+ // that wants to prevent anyone from instantiating it. This is
102
+ // especially useful for classes containing only static methods.
103
+ #define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
104
+ TypeName(); \
105
+ DISALLOW_COPY_AND_ASSIGN(TypeName)
106
+
107
+ // The arraysize(arr) macro returns the # of elements in an array arr.
108
+ // The expression is a compile-time constant, and therefore can be
109
+ // used in defining new arrays, for example. If you use arraysize on
110
+ // a pointer by mistake, you will get a compile-time error.
111
+ //
112
+ // One caveat is that arraysize() doesn't accept any array of an
113
+ // anonymous type or a type defined inside a function. In these rare
114
+ // cases, you have to use the unsafe ARRAYSIZE() macro below. This is
115
+ // due to a limitation in C++'s template system. The limitation might
116
+ // eventually be removed, but it hasn't happened yet.
117
+
118
+ // This template function declaration is used in defining arraysize.
119
+ // Note that the function doesn't need an implementation, as we only
120
+ // use its type.
121
+ template <typename T, size_t N>
122
+ char (&ArraySizeHelper(T (&array)[N]))[N];
123
+
124
+ // That gcc wants both of these prototypes seems mysterious. VC, for
125
+ // its part, can't decide which to use (another mystery). Matching of
126
+ // template overloads: the final frontier.
127
+ #ifndef COMPILER_MSVC
128
+ template <typename T, size_t N>
129
+ char (&ArraySizeHelper(const T (&array)[N]))[N];
130
+ #endif
131
+
132
+ #define arraysize(array) (sizeof(ArraySizeHelper(array)))
133
+
134
+ // ARRAYSIZE performs essentially the same calculation as arraysize,
135
+ // but can be used on anonymous types or types defined inside
136
+ // functions. It's less safe than arraysize as it accepts some
137
+ // (although not all) pointers. Therefore, you should use arraysize
138
+ // whenever possible.
139
+ //
140
+ // The expression ARRAYSIZE(a) is a compile-time constant of type
141
+ // size_t.
142
+ //
143
+ // ARRAYSIZE catches a few type errors. If you see a compiler error
144
+ //
145
+ // "warning: division by zero in ..."
146
+ //
147
+ // when using ARRAYSIZE, you are (wrongfully) giving it a pointer.
148
+ // You should only use ARRAYSIZE on statically allocated arrays.
149
+ //
150
+ // The following comments are on the implementation details, and can
151
+ // be ignored by the users.
152
+ //
153
+ // ARRAYSIZE(arr) works by inspecting sizeof(arr) (the # of bytes in
154
+ // the array) and sizeof(*(arr)) (the # of bytes in one array
155
+ // element). If the former is divisible by the latter, perhaps arr is
156
+ // indeed an array, in which case the division result is the # of
157
+ // elements in the array. Otherwise, arr cannot possibly be an array,
158
+ // and we generate a compiler error to prevent the code from
159
+ // compiling.
160
+ //
161
+ // Since the size of bool is implementation-defined, we need to cast
162
+ // !(sizeof(a) % sizeof(*(a))) to size_t in order to ensure the final
163
+ // result has type size_t.
164
+ //
165
+ // This macro is not perfect as it wrongfully accepts certain
166
+ // pointers, namely where the pointer size is divisible by the pointee
167
+ // size. Since all our code has to go through a 32-bit compiler,
168
+ // where a pointer is 4 bytes, this means all pointers to a type whose
169
+ // size is 3 or greater than 4 will be (rightfully) rejected.
170
+ //
171
+ // Kudos to Jorg Brown for this simple and elegant implementation.
172
+ //
173
+ // - wan 2005-11-16
174
+ //
175
+ // Starting with Visual C++ 2005, WinNT.h includes ARRAYSIZE.
176
+ #if !defined(COMPILER_MSVC) || (defined(_MSC_VER) && _MSC_VER < 1400)
177
+ #define ARRAYSIZE(a) \
178
+ ((sizeof(a) / sizeof(*(a))) / \
179
+ static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
180
+ #endif
181
+
182
+ // A macro to turn a symbol into a string
183
+ #define AS_STRING(x) AS_STRING_INTERNAL(x)
184
+ #define AS_STRING_INTERNAL(x) #x
185
+
186
+
187
+ // One of the type traits, is_pod, makes it possible to query whether
188
+ // a type is a POD type. It is impossible for type_traits.h to get
189
+ // this right without compiler support, so it fails conservatively. It
190
+ // knows that fundamental types and pointers are PODs, but it can't
191
+ // tell whether user classes are PODs. The DECLARE_POD macro is used
192
+ // to inform the type traits library that a user class is a POD.
193
+ //
194
+ // Implementation note: the typedef at the end is just to make it legal
195
+ // to put a semicolon after DECLARE_POD(foo).
196
+ //
197
+ //
198
+ // So what's a POD? The C++ standard (clause 9 paragraph 4) gives a
199
+ // full definition, but a good rule of thumb is that a struct is a POD
200
+ // ("plain old data") if it doesn't use any of the features that make
201
+ // C++ different from C. A POD struct can't have constructors,
202
+ // destructors, assignment operators, base classes, private or
203
+ // protected members, or virtual functions, and all of its member
204
+ // variables must themselves be PODs.
205
+
206
+ #define DECLARE_POD(TypeName) \
207
+ namespace base { \
208
+ template<> struct is_pod<TypeName> : true_type { }; \
209
+ } \
210
+ typedef int Dummy_Type_For_DECLARE_POD \
211
+
212
+ // We once needed a different technique to assert that a nested class
213
+ // is a POD. This is no longer necessary, and DECLARE_NESTED_POD is
214
+ // just a synonym for DECLARE_POD. We continue to provide
215
+ // DECLARE_NESTED_POD only so we don't have to change client
216
+ // code. Regardless of whether you use DECLARE_POD or
217
+ // DECLARE_NESTED_POD: use it after the outer class. Using it within a
218
+ // class definition will give a compiler error.
219
+ #define DECLARE_NESTED_POD(TypeName) DECLARE_POD(TypeName)
220
+
221
+ // Declare that TemplateName<T> is a POD whenever T is
222
+ #define PROPAGATE_POD_FROM_TEMPLATE_ARGUMENT(TemplateName) \
223
+ namespace base { \
224
+ template <typename T> struct is_pod<TemplateName<T> > : is_pod<T> { }; \
225
+ } \
226
+ typedef int Dummy_Type_For_PROPAGATE_POD_FROM_TEMPLATE_ARGUMENT
227
+
228
+ // Macro that does nothing if TypeName is a POD, and gives a compiler
229
+ // error if TypeName is a non-POD. You should put a descriptive
230
+ // comment right next to the macro call so that people can tell what
231
+ // the compiler error is about.
232
+ //
233
+ // Implementation note: this works by taking the size of a type that's
234
+ // complete when TypeName is a POD and incomplete otherwise.
235
+
236
+ template <typename Boolean> struct ERROR_TYPE_MUST_BE_POD;
237
+ template <> struct ERROR_TYPE_MUST_BE_POD<base::true_type> { };
238
+ #define ENFORCE_POD(TypeName) \
239
+ enum { dummy_##TypeName \
240
+ = sizeof(ERROR_TYPE_MUST_BE_POD< \
241
+ typename base::is_pod<TypeName>::type>) }
242
+
243
+ #endif // BASE_MACROS_H_
@@ -0,0 +1,54 @@
1
+ // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef BASE_PORT_H_
6
+ #define BASE_PORT_H_
7
+
8
+ #include <stdarg.h>
9
+ #include "base/build_config.h"
10
+
11
+ #ifdef COMPILER_MSVC
12
+ #define GG_LONGLONG(x) x##I64
13
+ #define GG_ULONGLONG(x) x##UI64
14
+ #else
15
+ #define GG_LONGLONG(x) x##LL
16
+ #define GG_ULONGLONG(x) x##ULL
17
+ #endif
18
+
19
+ // Per C99 7.18.4, define __STDC_CONSTANT_MACROS before including <stdint.h>
20
+ // to get the INTn_C and UINTn_C macros for integer constants. It's difficult
21
+ // to guarantee any specific ordering of header includes, so it's difficult to
22
+ // guarantee that the INTn_C macros can be defined by including <stdint.h> at
23
+ // any specific point. Provide GG_INTn_C macros instead.
24
+
25
+ #define GG_INT8_C(x) (x)
26
+ #define GG_INT16_C(x) (x)
27
+ #define GG_INT32_C(x) (x)
28
+ #define GG_INT64_C(x) GG_LONGLONG(x)
29
+
30
+ #define GG_UINT8_C(x) (x ## U)
31
+ #define GG_UINT16_C(x) (x ## U)
32
+ #define GG_UINT32_C(x) (x ## U)
33
+ #define GG_UINT64_C(x) GG_ULONGLONG(x)
34
+
35
+ // It's possible for functions that use a va_list, such as StringPrintf, to
36
+ // invalidate the data in it upon use. The fix is to make a copy of the
37
+ // structure before using it and use that copy instead. va_copy is provided
38
+ // for this purpose. MSVC does not provide va_copy, so define an
39
+ // implementation here. It is not guaranteed that assignment is a copy, so the
40
+ // StringUtil.VariableArgsFunc unit test tests this capability.
41
+ #if defined(COMPILER_GCC)
42
+ #define GG_VA_COPY(a, b) (va_copy(a, b))
43
+ #elif defined(COMPILER_MSVC)
44
+ #define GG_VA_COPY(a, b) (a = b)
45
+ #endif
46
+
47
+ // Define an OS-neutral wrapper for shared library entry points
48
+ #if defined(OS_WIN)
49
+ #define API_CALL __stdcall
50
+ #else
51
+ #define API_CALL
52
+ #endif
53
+
54
+ #endif // BASE_PORT_H_