krukid-cld 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +27 -0
- data/Manifest +106 -0
- data/README.rdoc +173 -0
- data/Rakefile +15 -0
- data/base/basictypes.h +348 -0
- data/base/build_config.h +115 -0
- data/base/casts.h +156 -0
- data/base/commandlineflags.h +443 -0
- data/base/crash.h +41 -0
- data/base/dynamic_annotations.h +358 -0
- data/base/global_strip_options.h +59 -0
- data/base/log_severity.h +46 -0
- data/base/logging.h +1403 -0
- data/base/macros.h +243 -0
- data/base/port.h +54 -0
- data/base/scoped_ptr.h +428 -0
- data/base/stl_decl.h +0 -0
- data/base/stl_decl_msvc.h +107 -0
- data/base/string_util.h +29 -0
- data/base/strtoint.h +93 -0
- data/base/template_util.h +96 -0
- data/base/type_traits.h +198 -0
- data/base/vlog_is_on.h +143 -0
- data/build.sh +48 -0
- data/build.win.cmd +28 -0
- data/cld.gemspec +33 -0
- data/cld_encodings.h +95 -0
- data/encodings/compact_lang_det/#cldutil.cc# +905 -0
- data/encodings/compact_lang_det/#cldutil.h# +1205 -0
- data/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
- data/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
- data/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
- data/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
- data/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
- data/encodings/compact_lang_det/#tote.cc# +299 -0
- data/encodings/compact_lang_det/#tote.h# +89 -0
- data/encodings/compact_lang_det/cldutil.cc +905 -0
- data/encodings/compact_lang_det/cldutil.h +1205 -0
- data/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/encodings/compact_lang_det/compile.cmd +1 -0
- data/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/encodings/compact_lang_det/tote.cc +299 -0
- data/encodings/compact_lang_det/tote.h +89 -0
- data/encodings/compact_lang_det/unittest_data.h +193 -0
- data/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
- data/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/encodings/internal/encodings.cc +12 -0
- data/encodings/lang_enc.h +254 -0
- data/encodings/proto/encodings.pb.h +169 -0
- data/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +8 -0
- data/krukid-cld.gemspec +33 -0
- data/languages/internal/#languages.cc# +337 -0
- data/languages/internal/languages.cc +337 -0
- data/languages/proto/languages.pb.h +179 -0
- data/languages/public/languages.h +379 -0
- data/lib/cld.rb +12 -0
- data/test/test.rb +570 -0
- data/thunk.cc +131 -0
- metadata +196 -0
data/base/build_config.h
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
// This file adds defines about the platform we're currently building on.
|
6
|
+
// Operating System:
|
7
|
+
// OS_WIN / OS_MACOSX / OS_LINUX / OS_FREEBSD / OS_OPENBSD / OS_SOLARIS / OS_POSIX (any of these except WIN)
|
8
|
+
// Compiler:
|
9
|
+
// COMPILER_MSVC / COMPILER_GCC
|
10
|
+
// Processor:
|
11
|
+
// ARCH_CPU_X86 / ARCH_CPU_X86_64 / ARCH_CPU_X86_FAMILY (X86 or X86_64) / ARCH_CPU_ARMEL / ARCH_CPU_ARM_FAMILY
|
12
|
+
// ARCH_CPU_32_BITS / ARCH_CPU_64_BITS
|
13
|
+
|
14
|
+
#ifndef BUILD_BUILD_CONFIG_H_
|
15
|
+
#define BUILD_BUILD_CONFIG_H_
|
16
|
+
|
17
|
+
// A set of macros to use for platform detection.
|
18
|
+
#if defined(__APPLE__)
|
19
|
+
#define OS_MACOSX 1
|
20
|
+
#elif defined(__linux__)
|
21
|
+
#define OS_LINUX 1
|
22
|
+
// Use TOOLKIT_GTK on linux if TOOLKIT_VIEWS isn't defined.
|
23
|
+
#if !defined(TOOLKIT_VIEWS)
|
24
|
+
#define TOOLKIT_GTK
|
25
|
+
#endif
|
26
|
+
#elif defined(_WIN32)
|
27
|
+
#define OS_WIN 1
|
28
|
+
#define TOOLKIT_VIEWS 1
|
29
|
+
#elif defined(__FreeBSD__)
|
30
|
+
#define OS_FREEBSD 1
|
31
|
+
#define TOOLKIT_GTK
|
32
|
+
#elif defined(__OpenBSD__)
|
33
|
+
#define OS_OPENBSD 1
|
34
|
+
#define TOOLKIT_GTK
|
35
|
+
#elif defined(__sun)
|
36
|
+
#define OS_SOLARIS 1
|
37
|
+
#define TOOLKIT_GTK
|
38
|
+
#else
|
39
|
+
#error Please add support for your platform in build/build_config.h
|
40
|
+
#endif
|
41
|
+
|
42
|
+
// A flag derived from the above flags, used to cover GTK code in
|
43
|
+
// both TOOLKIT_GTK and TOOLKIT_VIEWS.
|
44
|
+
#if defined(TOOLKIT_GTK) || (defined(TOOLKIT_VIEWS) && !defined(OS_WIN))
|
45
|
+
#define TOOLKIT_USES_GTK 1
|
46
|
+
#endif
|
47
|
+
|
48
|
+
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_OPENBSD)
|
49
|
+
#define USE_NSS 1 // Use NSS for crypto.
|
50
|
+
#define USE_X11 1 // Use X for graphics.
|
51
|
+
#endif
|
52
|
+
|
53
|
+
// For access to standard POSIXish features, use OS_POSIX instead of a
|
54
|
+
// more specific macro.
|
55
|
+
#if defined(OS_MACOSX) || defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_SOLARIS)
|
56
|
+
#define OS_POSIX 1
|
57
|
+
// Use base::DataPack for name/value pairs.
|
58
|
+
#define USE_BASE_DATA_PACK 1
|
59
|
+
#endif
|
60
|
+
|
61
|
+
// Use tcmalloc
|
62
|
+
#if defined(OS_WIN) && ! defined(NO_TCMALLOC)
|
63
|
+
#define USE_TCMALLOC 1
|
64
|
+
#endif
|
65
|
+
|
66
|
+
// Compiler detection.
|
67
|
+
#if defined(__GNUC__)
|
68
|
+
#define COMPILER_GCC 1
|
69
|
+
#elif defined(_MSC_VER)
|
70
|
+
#define COMPILER_MSVC 1
|
71
|
+
#else
|
72
|
+
#error Please add support for your compiler in build/build_config.h
|
73
|
+
#endif
|
74
|
+
|
75
|
+
// Processor architecture detection. For more info on what's defined, see:
|
76
|
+
// http://msdn.microsoft.com/en-us/library/b0084kay.aspx
|
77
|
+
// http://www.agner.org/optimize/calling_conventions.pdf
|
78
|
+
// or with gcc, run: "echo | gcc -E -dM -"
|
79
|
+
#if defined(_M_X64) || defined(__x86_64__)
|
80
|
+
#define ARCH_CPU_X86_FAMILY 1
|
81
|
+
#define ARCH_CPU_X86_64 1
|
82
|
+
#define ARCH_CPU_64_BITS 1
|
83
|
+
#elif defined(_M_IX86) || defined(__i386__)
|
84
|
+
#define ARCH_CPU_X86_FAMILY 1
|
85
|
+
#define ARCH_CPU_X86 1
|
86
|
+
#define ARCH_CPU_32_BITS 1
|
87
|
+
#elif defined(__ARMEL__)
|
88
|
+
#define ARCH_CPU_ARM_FAMILY 1
|
89
|
+
#define ARCH_CPU_ARMEL 1
|
90
|
+
#define ARCH_CPU_32_BITS 1
|
91
|
+
#define WCHAR_T_IS_UNSIGNED 1
|
92
|
+
#else
|
93
|
+
#error Please add support for your architecture in build/build_config.h
|
94
|
+
#endif
|
95
|
+
|
96
|
+
// Type detection for wchar_t.
|
97
|
+
#if defined(OS_WIN)
|
98
|
+
#define WCHAR_T_IS_UTF16
|
99
|
+
#elif defined(OS_POSIX) && defined(COMPILER_GCC) && \
|
100
|
+
defined(__WCHAR_MAX__) && \
|
101
|
+
(__WCHAR_MAX__ == 0x7fffffff || __WCHAR_MAX__ == 0xffffffff)
|
102
|
+
#define WCHAR_T_IS_UTF32
|
103
|
+
#elif defined(OS_POSIX) && defined(COMPILER_GCC) && \
|
104
|
+
defined(__WCHAR_MAX__) && \
|
105
|
+
(__WCHAR_MAX__ == 0x7fff || __WCHAR_MAX__ == 0xffff)
|
106
|
+
// On Posix, we'll detect short wchar_t, but projects aren't guaranteed to
|
107
|
+
// compile in this mode (in particular, Chrome doesn't). This is intended for
|
108
|
+
// other projects using base who manage their own dependencies and make sure
|
109
|
+
// short wchar works for them.
|
110
|
+
#define WCHAR_T_IS_UTF16
|
111
|
+
#else
|
112
|
+
#error Please add support for your compiler in build/build_config.h
|
113
|
+
#endif
|
114
|
+
|
115
|
+
#endif // BUILD_BUILD_CONFIG_H_
|
data/base/casts.h
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef BASE_CASTS_H_
|
6
|
+
#define BASE_CASTS_H_
|
7
|
+
|
8
|
+
#include <assert.h> // for use with down_cast<>
|
9
|
+
#include <string.h> // for memcpy
|
10
|
+
|
11
|
+
#include "base/macros.h"
|
12
|
+
|
13
|
+
|
14
|
+
// Use implicit_cast as a safe version of static_cast or const_cast
|
15
|
+
// for upcasting in the type hierarchy (i.e. casting a pointer to Foo
|
16
|
+
// to a pointer to SuperclassOfFoo or casting a pointer to Foo to
|
17
|
+
// a const pointer to Foo).
|
18
|
+
// When you use implicit_cast, the compiler checks that the cast is safe.
|
19
|
+
// Such explicit implicit_casts are necessary in surprisingly many
|
20
|
+
// situations where C++ demands an exact type match instead of an
|
21
|
+
// argument type convertible to a target type.
|
22
|
+
//
|
23
|
+
// The From type can be inferred, so the preferred syntax for using
|
24
|
+
// implicit_cast is the same as for static_cast etc.:
|
25
|
+
//
|
26
|
+
// implicit_cast<ToType>(expr)
|
27
|
+
//
|
28
|
+
// implicit_cast would have been part of the C++ standard library,
|
29
|
+
// but the proposal was submitted too late. It will probably make
|
30
|
+
// its way into the language in the future.
|
31
|
+
template<typename To, typename From>
|
32
|
+
inline To implicit_cast(From const &f) {
|
33
|
+
return f;
|
34
|
+
}
|
35
|
+
|
36
|
+
|
37
|
+
// When you upcast (that is, cast a pointer from type Foo to type
|
38
|
+
// SuperclassOfFoo), it's fine to use implicit_cast<>, since upcasts
|
39
|
+
// always succeed. When you downcast (that is, cast a pointer from
|
40
|
+
// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
|
41
|
+
// how do you know the pointer is really of type SubclassOfFoo? It
|
42
|
+
// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus,
|
43
|
+
// when you downcast, you should use this macro. In debug mode, we
|
44
|
+
// use dynamic_cast<> to double-check the downcast is legal (we die
|
45
|
+
// if it's not). In normal mode, we do the efficient static_cast<>
|
46
|
+
// instead. Thus, it's important to test in debug mode to make sure
|
47
|
+
// the cast is legal!
|
48
|
+
// This is the only place in the code we should use dynamic_cast<>.
|
49
|
+
// In particular, you SHOULDN'T be using dynamic_cast<> in order to
|
50
|
+
// do RTTI, e.g. code like this:
|
51
|
+
// if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
|
52
|
+
// if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
|
53
|
+
// You should design the code some other way not to need this.
|
54
|
+
|
55
|
+
template<typename To, typename From> // use like this: down_cast<T*>(foo);
|
56
|
+
inline To down_cast(From* f) { // so we only accept pointers
|
57
|
+
// Ensures that To is a sub-type of From *. This test is here only
|
58
|
+
// for compile-time type checking, and has no overhead in an
|
59
|
+
// optimized build at run-time, as it will be optimized away
|
60
|
+
// completely.
|
61
|
+
if (false) {
|
62
|
+
implicit_cast<From*, To>(0);
|
63
|
+
}
|
64
|
+
|
65
|
+
assert(f == NULL || dynamic_cast<To>(f) != NULL); // RTTI: debug mode only!
|
66
|
+
return static_cast<To>(f);
|
67
|
+
}
|
68
|
+
|
69
|
+
// Overload of down_cast for references. Use like this: down_cast<T&>(foo).
|
70
|
+
// The code is slightly convoluted because we're still using the pointer
|
71
|
+
// form of dynamic cast. (The reference form throws an exception if it
|
72
|
+
// fails.)
|
73
|
+
//
|
74
|
+
// There's no need for a special const overload either for the pointer
|
75
|
+
// or the reference form. If you call down_cast with a const T&, the
|
76
|
+
// compiler will just bind From to const T.
|
77
|
+
template<typename To, typename From>
|
78
|
+
inline To down_cast(From& f) {
|
79
|
+
COMPILE_ASSERT(base::is_reference<To>::value, target_type_not_a_reference);
|
80
|
+
typedef typename base::remove_reference<To>::type* ToAsPointer;
|
81
|
+
if (false) {
|
82
|
+
// Compile-time check that To inherits from From. See above for details.
|
83
|
+
implicit_cast<From*, ToAsPointer>(0);
|
84
|
+
}
|
85
|
+
|
86
|
+
assert(dynamic_cast<ToAsPointer>(&f) != NULL); // RTTI: debug mode only
|
87
|
+
return static_cast<To>(f);
|
88
|
+
}
|
89
|
+
|
90
|
+
// bit_cast<Dest,Source> is a template function that implements the
|
91
|
+
// equivalent of "*reinterpret_cast<Dest*>(&source)". We need this in
|
92
|
+
// very low-level functions like the protobuf library and fast math
|
93
|
+
// support.
|
94
|
+
//
|
95
|
+
// float f = 3.14159265358979;
|
96
|
+
// int i = bit_cast<int32>(f);
|
97
|
+
// // i = 0x40490fdb
|
98
|
+
//
|
99
|
+
// The classical address-casting method is:
|
100
|
+
//
|
101
|
+
// // WRONG
|
102
|
+
// float f = 3.14159265358979; // WRONG
|
103
|
+
// int i = * reinterpret_cast<int*>(&f); // WRONG
|
104
|
+
//
|
105
|
+
// The address-casting method actually produces undefined behavior
|
106
|
+
// according to ISO C++ specification section 3.10 -15-. Roughly, this
|
107
|
+
// section says: if an object in memory has one type, and a program
|
108
|
+
// accesses it with a different type, then the result is undefined
|
109
|
+
// behavior for most values of "different type".
|
110
|
+
//
|
111
|
+
// This is true for any cast syntax, either *(int*)&f or
|
112
|
+
// *reinterpret_cast<int*>(&f). And it is particularly true for
|
113
|
+
// conversions between integral lvalues and floating-point lvalues.
|
114
|
+
//
|
115
|
+
// The purpose of 3.10 -15- is to allow optimizing compilers to assume
|
116
|
+
// that expressions with different types refer to different memory. gcc
|
117
|
+
// 4.0.1 has an optimizer that takes advantage of this. So a
|
118
|
+
// non-conforming program quietly produces wildly incorrect output.
|
119
|
+
//
|
120
|
+
// The problem is not the use of reinterpret_cast. The problem is type
|
121
|
+
// punning: holding an object in memory of one type and reading its bits
|
122
|
+
// back using a different type.
|
123
|
+
//
|
124
|
+
// The C++ standard is more subtle and complex than this, but that
|
125
|
+
// is the basic idea.
|
126
|
+
//
|
127
|
+
// Anyways ...
|
128
|
+
//
|
129
|
+
// bit_cast<> calls memcpy() which is blessed by the standard,
|
130
|
+
// especially by the example in section 3.9. Also, of course,
|
131
|
+
// bit_cast<> wraps up the nasty logic in one place.
|
132
|
+
//
|
133
|
+
// Fortunately memcpy() is very fast. In optimized mode, with a
|
134
|
+
// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
|
135
|
+
// code with the minimal amount of data movement. On a 32-bit system,
|
136
|
+
// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
|
137
|
+
// compiles to two loads and two stores.
|
138
|
+
//
|
139
|
+
// I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1.
|
140
|
+
//
|
141
|
+
// WARNING: if Dest or Source is a non-POD type, the result of the memcpy
|
142
|
+
// is likely to surprise you.
|
143
|
+
//
|
144
|
+
|
145
|
+
template <class Dest, class Source>
|
146
|
+
inline Dest bit_cast(const Source& source) {
|
147
|
+
// Compile time assertion: sizeof(Dest) == sizeof(Source)
|
148
|
+
// A compile error here means your Dest and Source have different sizes.
|
149
|
+
typedef char VerifySizesAreEqual [sizeof(Dest) == sizeof(Source) ? 1 : -1];
|
150
|
+
|
151
|
+
Dest dest;
|
152
|
+
memcpy(&dest, &source, sizeof(dest));
|
153
|
+
return dest;
|
154
|
+
}
|
155
|
+
|
156
|
+
#endif // BASE_CASTS_H_
|