krukid-cld 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (108) hide show
  1. data/LICENSE +27 -0
  2. data/Manifest +106 -0
  3. data/README.rdoc +173 -0
  4. data/Rakefile +15 -0
  5. data/base/basictypes.h +348 -0
  6. data/base/build_config.h +115 -0
  7. data/base/casts.h +156 -0
  8. data/base/commandlineflags.h +443 -0
  9. data/base/crash.h +41 -0
  10. data/base/dynamic_annotations.h +358 -0
  11. data/base/global_strip_options.h +59 -0
  12. data/base/log_severity.h +46 -0
  13. data/base/logging.h +1403 -0
  14. data/base/macros.h +243 -0
  15. data/base/port.h +54 -0
  16. data/base/scoped_ptr.h +428 -0
  17. data/base/stl_decl.h +0 -0
  18. data/base/stl_decl_msvc.h +107 -0
  19. data/base/string_util.h +29 -0
  20. data/base/strtoint.h +93 -0
  21. data/base/template_util.h +96 -0
  22. data/base/type_traits.h +198 -0
  23. data/base/vlog_is_on.h +143 -0
  24. data/build.sh +48 -0
  25. data/build.win.cmd +28 -0
  26. data/cld.gemspec +33 -0
  27. data/cld_encodings.h +95 -0
  28. data/encodings/compact_lang_det/#cldutil.cc# +905 -0
  29. data/encodings/compact_lang_det/#cldutil.h# +1205 -0
  30. data/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
  31. data/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
  32. data/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
  33. data/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
  34. data/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
  35. data/encodings/compact_lang_det/#tote.cc# +299 -0
  36. data/encodings/compact_lang_det/#tote.h# +89 -0
  37. data/encodings/compact_lang_det/cldutil.cc +905 -0
  38. data/encodings/compact_lang_det/cldutil.h +1205 -0
  39. data/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  40. data/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  41. data/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  42. data/encodings/compact_lang_det/compact_lang_det.h +145 -0
  43. data/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  44. data/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  45. data/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  46. data/encodings/compact_lang_det/compile.cmd +1 -0
  47. data/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  48. data/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  49. data/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  50. data/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  51. data/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  52. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  53. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  54. data/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  55. data/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  56. data/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  57. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  58. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  59. data/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  60. data/encodings/compact_lang_det/getonescriptspan.h +131 -0
  61. data/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  62. data/encodings/compact_lang_det/letterscript_enum.h +99 -0
  63. data/encodings/compact_lang_det/subsetsequence.cc +259 -0
  64. data/encodings/compact_lang_det/subsetsequence.h +44 -0
  65. data/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  66. data/encodings/compact_lang_det/tote.cc +299 -0
  67. data/encodings/compact_lang_det/tote.h +89 -0
  68. data/encodings/compact_lang_det/unittest_data.h +193 -0
  69. data/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  70. data/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  71. data/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  72. data/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
  73. data/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  74. data/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  75. data/encodings/compact_lang_det/win/cld_google.h +18 -0
  76. data/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  77. data/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  78. data/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  79. data/encodings/compact_lang_det/win/cld_logging.h +21 -0
  80. data/encodings/compact_lang_det/win/cld_macros.h +19 -0
  81. data/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  82. data/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  83. data/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  84. data/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  85. data/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  86. data/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  87. data/encodings/compact_lang_det/win/cld_utf.h +24 -0
  88. data/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  89. data/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  90. data/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  91. data/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  92. data/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  93. data/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  94. data/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  95. data/encodings/internal/encodings.cc +12 -0
  96. data/encodings/lang_enc.h +254 -0
  97. data/encodings/proto/encodings.pb.h +169 -0
  98. data/encodings/public/encodings.h +301 -0
  99. data/ext/cld/extconf.rb +8 -0
  100. data/krukid-cld.gemspec +33 -0
  101. data/languages/internal/#languages.cc# +337 -0
  102. data/languages/internal/languages.cc +337 -0
  103. data/languages/proto/languages.pb.h +179 -0
  104. data/languages/public/languages.h +379 -0
  105. data/lib/cld.rb +12 -0
  106. data/test/test.rb +570 -0
  107. data/thunk.cc +131 -0
  108. metadata +196 -0
@@ -0,0 +1,115 @@
1
+ // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // This file adds defines about the platform we're currently building on.
6
+ // Operating System:
7
+ // OS_WIN / OS_MACOSX / OS_LINUX / OS_POSIX (MACOSX, LINUX, FREEBSD, OPENBSD or SOLARIS)
8
+ // Compiler:
9
+ // COMPILER_MSVC / COMPILER_GCC
10
+ // Processor:
11
+ // ARCH_CPU_X86 / ARCH_CPU_X86_64 / ARCH_CPU_X86_FAMILY (X86 or X86_64)
12
+ // ARCH_CPU_32_BITS / ARCH_CPU_64_BITS
13
+
14
+ #ifndef BUILD_BUILD_CONFIG_H_
15
+ #define BUILD_BUILD_CONFIG_H_
16
+
17
+ // A set of macros to use for platform detection.
18
+ #if defined(__APPLE__)
19
+ #define OS_MACOSX 1
20
+ #elif defined(__linux__)
21
+ #define OS_LINUX 1
22
+ // Use TOOLKIT_GTK on linux if TOOLKIT_VIEWS isn't defined.
23
+ #if !defined(TOOLKIT_VIEWS)
24
+ #define TOOLKIT_GTK
25
+ #endif
26
+ #elif defined(_WIN32)
27
+ #define OS_WIN 1
28
+ #define TOOLKIT_VIEWS 1
29
+ #elif defined(__FreeBSD__)
30
+ #define OS_FREEBSD 1
31
+ #define TOOLKIT_GTK
32
+ #elif defined(__OpenBSD__)
33
+ #define OS_OPENBSD 1
34
+ #define TOOLKIT_GTK
35
+ #elif defined(__sun)
36
+ #define OS_SOLARIS 1
37
+ #define TOOLKIT_GTK
38
+ #else
39
+ #error Please add support for your platform in build/build_config.h
40
+ #endif
41
+
42
+ // A flag derived from the above flags, used to cover GTK code in
43
+ // both TOOLKIT_GTK and TOOLKIT_VIEWS.
44
+ #if defined(TOOLKIT_GTK) || (defined(TOOLKIT_VIEWS) && !defined(OS_WIN))
45
+ #define TOOLKIT_USES_GTK 1
46
+ #endif
47
+
48
+ #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_OPENBSD)
49
+ #define USE_NSS 1 // Use NSS for crypto.
50
+ #define USE_X11 1 // Use X for graphics.
51
+ #endif
52
+
53
+ // For access to standard POSIXish features, use OS_POSIX instead of a
54
+ // more specific macro.
55
+ #if defined(OS_MACOSX) || defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_SOLARIS)
56
+ #define OS_POSIX 1
57
+ // Use base::DataPack for name/value pairs.
58
+ #define USE_BASE_DATA_PACK 1
59
+ #endif
60
+
61
+ // Use tcmalloc
62
+ #if defined(OS_WIN) && ! defined(NO_TCMALLOC)
63
+ #define USE_TCMALLOC 1
64
+ #endif
65
+
66
+ // Compiler detection.
67
+ #if defined(__GNUC__)
68
+ #define COMPILER_GCC 1
69
+ #elif defined(_MSC_VER)
70
+ #define COMPILER_MSVC 1
71
+ #else
72
+ #error Please add support for your compiler in build/build_config.h
73
+ #endif
74
+
75
+ // Processor architecture detection. For more info on what's defined, see:
76
+ // http://msdn.microsoft.com/en-us/library/b0084kay.aspx
77
+ // http://www.agner.org/optimize/calling_conventions.pdf
78
+ // or with gcc, run: "echo | gcc -E -dM -"
79
+ #if defined(_M_X64) || defined(__x86_64__)
80
+ #define ARCH_CPU_X86_FAMILY 1
81
+ #define ARCH_CPU_X86_64 1
82
+ #define ARCH_CPU_64_BITS 1
83
+ #elif defined(_M_IX86) || defined(__i386__)
84
+ #define ARCH_CPU_X86_FAMILY 1
85
+ #define ARCH_CPU_X86 1
86
+ #define ARCH_CPU_32_BITS 1
87
+ #elif defined(__ARMEL__)
88
+ #define ARCH_CPU_ARM_FAMILY 1
89
+ #define ARCH_CPU_ARMEL 1
90
+ #define ARCH_CPU_32_BITS 1
91
+ #define WCHAR_T_IS_UNSIGNED 1
92
+ #else
93
+ #error Please add support for your architecture in build/build_config.h
94
+ #endif
95
+
96
+ // Type detection for wchar_t.
97
+ #if defined(OS_WIN)
98
+ #define WCHAR_T_IS_UTF16
99
+ #elif defined(OS_POSIX) && defined(COMPILER_GCC) && \
100
+ defined(__WCHAR_MAX__) && \
101
+ (__WCHAR_MAX__ == 0x7fffffff || __WCHAR_MAX__ == 0xffffffff)
102
+ #define WCHAR_T_IS_UTF32
103
+ #elif defined(OS_POSIX) && defined(COMPILER_GCC) && \
104
+ defined(__WCHAR_MAX__) && \
105
+ (__WCHAR_MAX__ == 0x7fff || __WCHAR_MAX__ == 0xffff)
106
+ // On Posix, we'll detect short wchar_t, but projects aren't guaranteed to
107
+ // compile in this mode (in particular, Chrome doesn't). This is intended for
108
+ // other projects using base who manage their own dependencies and make sure
109
+ // short wchar works for them.
110
+ #define WCHAR_T_IS_UTF16
111
+ #else
112
+ #error Please add support for your compiler in build/build_config.h
113
+ #endif
114
+
115
+ #endif // BUILD_BUILD_CONFIG_H_
data/base/casts.h ADDED
@@ -0,0 +1,156 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef BASE_CASTS_H_
6
+ #define BASE_CASTS_H_
7
+
8
+ #include <assert.h> // for use with down_cast<>
9
+ #include <string.h> // for memcpy
10
+
11
+ #include "base/macros.h"
12
+
13
+
14
+ // Use implicit_cast as a safe version of static_cast or const_cast
15
+ // for upcasting in the type hierarchy (i.e. casting a pointer to Foo
16
+ // to a pointer to SuperclassOfFoo or casting a pointer to Foo to
17
+ // a const pointer to Foo).
18
+ // When you use implicit_cast, the compiler checks that the cast is safe.
19
+ // Such explicit implicit_casts are necessary in surprisingly many
20
+ // situations where C++ demands an exact type match instead of an
21
+ // argument type convertible to a target type.
22
+ //
23
+ // The From type can be inferred, so the preferred syntax for using
24
+ // implicit_cast is the same as for static_cast etc.:
25
+ //
26
+ // implicit_cast<ToType>(expr)
27
+ //
28
+ // implicit_cast would have been part of the C++ standard library,
29
+ // but the proposal was submitted too late. It will probably make
30
+ // its way into the language in the future.
31
+ template<typename To, typename From>
32
+ inline To implicit_cast(From const &f) {
33
+ return f;
34
+ }
35
+
36
+
37
+ // When you upcast (that is, cast a pointer from type Foo to type
38
+ // SuperclassOfFoo), it's fine to use implicit_cast<>, since upcasts
39
+ // always succeed. When you downcast (that is, cast a pointer from
40
+ // type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
41
+ // how do you know the pointer is really of type SubclassOfFoo? It
42
+ // could be a bare Foo, or of type DifferentSubclassOfFoo. Thus,
43
+ // when you downcast, you should use this macro. In debug mode, we
44
+ // use dynamic_cast<> to double-check the downcast is legal (we die
45
+ // if it's not). In normal mode, we do the efficient static_cast<>
46
+ // instead. Thus, it's important to test in debug mode to make sure
47
+ // the cast is legal!
48
+ // This is the only place in the code we should use dynamic_cast<>.
49
+ // In particular, you SHOULDN'T be using dynamic_cast<> in order to
50
+ // do RTTI, e.g. with code like this:
51
+ // if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
52
+ // if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
53
+ // You should design the code some other way not to need this.
54
+
55
+ template<typename To, typename From> // use like this: down_cast<T*>(foo);
56
+ inline To down_cast(From* f) { // so we only accept pointers
57
+ // Ensures that To is a sub-type of From *. This test is here only
58
+ // for compile-time type checking, and has no overhead in an
59
+ // optimized build at run-time, as it will be optimized away
60
+ // completely.
61
+ if (false) {
62
+ implicit_cast<From*, To>(0);
63
+ }
64
+
65
+ assert(f == NULL || dynamic_cast<To>(f) != NULL); // RTTI: debug mode only!
66
+ return static_cast<To>(f);
67
+ }
68
+
69
+ // Overload of down_cast for references. Use like this: down_cast<T&>(foo).
70
+ // The code is slightly convoluted because we're still using the pointer
71
+ // form of dynamic cast. (The reference form throws an exception if it
72
+ // fails.)
73
+ //
74
+ // There's no need for a special const overload either for the pointer
75
+ // or the reference form. If you call down_cast with a const T&, the
76
+ // compiler will just bind From to const T.
77
+ template<typename To, typename From>
78
+ inline To down_cast(From& f) {
79
+ COMPILE_ASSERT(base::is_reference<To>::value, target_type_not_a_reference);
80
+ typedef typename base::remove_reference<To>::type* ToAsPointer;
81
+ if (false) {
82
+ // Compile-time check that To inherits from From. See above for details.
83
+ implicit_cast<From*, ToAsPointer>(0);
84
+ }
85
+
86
+ assert(dynamic_cast<ToAsPointer>(&f) != NULL); // RTTI: debug mode only
87
+ return static_cast<To>(f);
88
+ }
89
+
90
+ // bit_cast<Dest,Source> is a template function that implements the
91
+ // equivalent of "*reinterpret_cast<Dest*>(&source)". We need this in
92
+ // very low-level functions like the protobuf library and fast math
93
+ // support.
94
+ //
95
+ // float f = 3.14159265358979;
96
+ // int i = bit_cast<int32>(f);
97
+ // // i = 0x40490fdb
98
+ //
99
+ // The classical address-casting method is:
100
+ //
101
+ // // WRONG
102
+ // float f = 3.14159265358979; // WRONG
103
+ // int i = * reinterpret_cast<int*>(&f); // WRONG
104
+ //
105
+ // The address-casting method actually produces undefined behavior
106
+ // according to ISO C++ specification section 3.10 -15-. Roughly, this
107
+ // section says: if an object in memory has one type, and a program
108
+ // accesses it with a different type, then the result is undefined
109
+ // behavior for most values of "different type".
110
+ //
111
+ // This is true for any cast syntax, either *(int*)&f or
112
+ // *reinterpret_cast<int*>(&f). And it is particularly true for
113
+ // conversions between integral lvalues and floating-point lvalues.
114
+ //
115
+ // The purpose of 3.10 -15- is to allow optimizing compilers to assume
116
+ // that expressions with different types refer to different memory. gcc
117
+ // 4.0.1 has an optimizer that takes advantage of this. So a
118
+ // non-conforming program quietly produces wildly incorrect output.
119
+ //
120
+ // The problem is not the use of reinterpret_cast. The problem is type
121
+ // punning: holding an object in memory of one type and reading its bits
122
+ // back using a different type.
123
+ //
124
+ // The C++ standard is more subtle and complex than this, but that
125
+ // is the basic idea.
126
+ //
127
+ // Anyways ...
128
+ //
129
+ // bit_cast<> calls memcpy() which is blessed by the standard,
130
+ // especially by the example in section 3.9. Also, of course,
131
+ // bit_cast<> wraps up the nasty logic in one place.
132
+ //
133
+ // Fortunately memcpy() is very fast. In optimized mode, with a
134
+ // constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
135
+ // code with the minimal amount of data movement. On a 32-bit system,
136
+ // memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
137
+ // compiles to two loads and two stores.
138
+ //
139
+ // I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1.
140
+ //
141
+ // WARNING: if Dest or Source is a non-POD type, the result of the memcpy
142
+ // is likely to surprise you.
143
+ //
144
+
145
+ template <class Dest, class Source>
146
+ inline Dest bit_cast(const Source& source) {
147
+ // Compile time assertion: sizeof(Dest) == sizeof(Source)
148
+ // A compile error here means your Dest and Source have different sizes.
149
+ typedef char VerifySizesAreEqual [sizeof(Dest) == sizeof(Source) ? 1 : -1];
150
+
151
+ Dest dest;
152
+ memcpy(&dest, &source, sizeof(dest));
153
+ return dest;
154
+ }
155
+
156
+ #endif // BASE_CASTS_H_