language_detection 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Copyright (c) 2012 Vojtech Hyza
|
|
2
|
+
|
|
3
|
+
MIT License
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
6
|
+
a copy of this software and associated documentation files (the
|
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
11
|
+
the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be
|
|
14
|
+
included in all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# LanguageDetection
|
|
2
|
+
|
|
3
|
+
Ruby bindings for the Chromium Compact Language Detector ([source](http://src.chromium.org/viewvc/chrome/trunk/src/third_party/cld/)). This gem uses source code from the [chromium-compact-language-detector](http://code.google.com/p/chromium-compact-language-detector/) port.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
Add this line to your application's Gemfile:
|
|
8
|
+
|
|
9
|
+
gem 'language_detection'
|
|
10
|
+
|
|
11
|
+
And then execute:
|
|
12
|
+
|
|
13
|
+
$ bundle
|
|
14
|
+
|
|
15
|
+
Or install it yourself as:
|
|
16
|
+
|
|
17
|
+
$ gem install language_detection
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
```ruby
|
|
22
|
+
>> require 'language_detection'
|
|
23
|
+
=> true
|
|
24
|
+
>> language = LanguageDetection.perform("This is some example text for language detection")
|
|
25
|
+
=> {:name=>"ENGLISH", :code=>"en", :reliable=>true, :text_bytes=>51, :details=>[{:name=>"ENGLISH", :code=>"en", :percent=>100, :score=>49.43273905996759}]}
|
|
26
|
+
>> language.name
|
|
27
|
+
=> "ENGLISH"
|
|
28
|
+
>> language.code
|
|
29
|
+
=> "en"
|
|
30
|
+
>> language.reliable
|
|
31
|
+
=> true
|
|
32
|
+
>> language.details # contains up to 3 languages sorted by score
|
|
33
|
+
=> [{:name=>"ENGLISH", :code=>"en", :percent=>100, :score=>49.43273905996759}]
|
|
34
|
+
>> language.details.first.percent
|
|
35
|
+
=> 100
|
|
36
|
+
>> language.details.first.score
|
|
37
|
+
=> 49.43273905996759
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
The other way is to include the `LanguageDetection` module in your class:
|
|
41
|
+
|
|
42
|
+
```ruby
|
|
43
|
+
class Article
|
|
44
|
+
include LanguageDetection
|
|
45
|
+
|
|
46
|
+
attr_accessor :title, :content
|
|
47
|
+
|
|
48
|
+
def initialize(params = {})
|
|
49
|
+
@title = params[:title]
|
|
50
|
+
@content = params[:content]
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def to_s
|
|
54
|
+
"#{title}\n#{content}"
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
This provides an `Article#language` method, which runs detection on the string returned by `Article#to_s`:
|
|
60
|
+
|
|
61
|
+
```ruby
|
|
62
|
+
>> article = Article.new :title => "Web development that doesn't hurt", :content => "Tens of thousands of Rails applications are already live..."
|
|
63
|
+
>> article.language
|
|
64
|
+
=> {:name=>"ENGLISH", :code=>"en", :reliable=>true, :text_bytes=>93, :details=>[{:name=>"ENGLISH", :code=>"en", :percent=>100, :score=>80.22690437601297}]}
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Or you can add a `String#language` method with `require 'language_detection/string'`:
|
|
68
|
+
|
|
69
|
+
```ruby
|
|
70
|
+
>> require 'language_detection'
|
|
71
|
+
=> true
|
|
72
|
+
>> require 'language_detection/string'
|
|
73
|
+
=> true
|
|
74
|
+
>> "Web development that doesn't hurt".language
|
|
75
|
+
=> {:name=>"ENGLISH", :code=>"en", :reliable=>true, :text_bytes=>36, :details=>[{:name=>"ENGLISH", :code=>"en", :percent=>100, :score=>39.70826580226905}]}
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
## Contributing
|
|
80
|
+
|
|
81
|
+
1. Fork it
|
|
82
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
|
83
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
|
84
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
|
85
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/ext/cld/Makefile
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# TODO: Generate Makefile
|
|
2
|
+
|
|
3
|
+
CFLAGS=-fPIC -I. -O2 -DCLD_WINDOWS
|
|
4
|
+
LDFLAGS=-L.
|
|
5
|
+
CC=g++
|
|
6
|
+
AR=ar
|
|
7
|
+
|
|
8
|
+
SOURCES = encodings/compact_lang_det/cldutil.cc \
|
|
9
|
+
encodings/compact_lang_det/cldutil_dbg_empty.cc \
|
|
10
|
+
encodings/compact_lang_det/compact_lang_det.cc \
|
|
11
|
+
encodings/compact_lang_det/compact_lang_det_impl.cc \
|
|
12
|
+
encodings/compact_lang_det/ext_lang_enc.cc \
|
|
13
|
+
encodings/compact_lang_det/getonescriptspan.cc \
|
|
14
|
+
encodings/compact_lang_det/letterscript_enum.cc \
|
|
15
|
+
encodings/compact_lang_det/tote.cc \
|
|
16
|
+
encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc \
|
|
17
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc \
|
|
18
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc \
|
|
19
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc \
|
|
20
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc \
|
|
21
|
+
encodings/compact_lang_det/win/cld_htmlutils_windows.cc \
|
|
22
|
+
encodings/compact_lang_det/win/cld_unilib_windows.cc \
|
|
23
|
+
encodings/compact_lang_det/win/cld_utf8statetable.cc \
|
|
24
|
+
encodings/compact_lang_det/win/cld_utf8utils_windows.cc \
|
|
25
|
+
encodings/internal/encodings.cc \
|
|
26
|
+
languages/internal/languages.cc \
|
|
27
|
+
language_detection.cc
|
|
28
|
+
|
|
29
|
+
install:
|
|
30
|
+
rm -f *.o
|
|
31
|
+
rm -f libcld.a
|
|
32
|
+
$(CC) -c $(CFLAGS) $(SOURCES)
|
|
33
|
+
$(AR) rcs libcld.a *.o
|
|
34
|
+
$(CC) -DCLD_WINDOWS -I. -L. -shared -o cld.so -lstdc++ *.o
|
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef BASE_BASICTYPES_H_
|
|
6
|
+
#define BASE_BASICTYPES_H_
|
|
7
|
+
|
|
8
|
+
#include <limits.h> // So we can set the bounds of our types
|
|
9
|
+
#include <stddef.h> // For size_t
|
|
10
|
+
#include <string.h> // for memcpy
|
|
11
|
+
|
|
12
|
+
#include "base/port.h" // Types that only need exist on certain systems
|
|
13
|
+
|
|
14
|
+
#ifndef COMPILER_MSVC
|
|
15
|
+
// stdint.h is part of C99 but MSVC doesn't have it.
|
|
16
|
+
#include <stdint.h> // For intptr_t.
|
|
17
|
+
#endif
|
|
18
|
+
|
|
19
|
+
typedef signed char schar;
|
|
20
|
+
typedef signed char int8;
|
|
21
|
+
typedef short int16;
|
|
22
|
+
// TODO(mbelshe) Remove these type guards. These are
|
|
23
|
+
// temporary to avoid conflicts with npapi.h.
|
|
24
|
+
#ifndef _INT32
|
|
25
|
+
#define _INT32
|
|
26
|
+
typedef int int32;
|
|
27
|
+
#endif
|
|
28
|
+
|
|
29
|
+
// The NSPR system headers define 64-bit as |long| when possible. In order to
|
|
30
|
+
// not have typedef mismatches, we do the same on LP64.
|
|
31
|
+
#if __LP64__
|
|
32
|
+
typedef long int64;
|
|
33
|
+
#else
|
|
34
|
+
typedef long long int64;
|
|
35
|
+
#endif
|
|
36
|
+
|
|
37
|
+
// NOTE: unsigned types are DANGEROUS in loops and other arithmetical
|
|
38
|
+
// places. Use the signed types unless your variable represents a bit
|
|
39
|
+
// pattern (eg a hash value) or you really need the extra bit. Do NOT
|
|
40
|
+
// use 'unsigned' to express "this value should always be positive";
|
|
41
|
+
// use assertions for this.
|
|
42
|
+
|
|
43
|
+
typedef unsigned char uint8;
|
|
44
|
+
typedef unsigned short uint16;
|
|
45
|
+
// TODO(mbelshe) Remove these type guards. These are
|
|
46
|
+
// temporary to avoid conflicts with npapi.h.
|
|
47
|
+
#ifndef _UINT32
|
|
48
|
+
#define _UINT32
|
|
49
|
+
typedef unsigned int uint32;
|
|
50
|
+
#endif
|
|
51
|
+
|
|
52
|
+
// See the comment above about NSPR and 64-bit.
|
|
53
|
+
#if __LP64__
|
|
54
|
+
typedef unsigned long uint64;
|
|
55
|
+
#else
|
|
56
|
+
typedef unsigned long long uint64;
|
|
57
|
+
#endif
|
|
58
|
+
|
|
59
|
+
// A type to represent a Unicode code-point value. As of Unicode 4.0,
|
|
60
|
+
// such values require up to 21 bits.
|
|
61
|
+
// (For type-checking on pointers, make this explicitly signed,
|
|
62
|
+
// and it should always be the signed version of whatever int32 is.)
|
|
63
|
+
typedef signed int char32;
|
|
64
|
+
|
|
65
|
+
const uint8 kuint8max = (( uint8) 0xFF);
|
|
66
|
+
const uint16 kuint16max = ((uint16) 0xFFFF);
|
|
67
|
+
const uint32 kuint32max = ((uint32) 0xFFFFFFFF);
|
|
68
|
+
const uint64 kuint64max = ((uint64) GG_LONGLONG(0xFFFFFFFFFFFFFFFF));
|
|
69
|
+
const int8 kint8min = (( int8) 0x80);
|
|
70
|
+
const int8 kint8max = (( int8) 0x7F);
|
|
71
|
+
const int16 kint16min = (( int16) 0x8000);
|
|
72
|
+
const int16 kint16max = (( int16) 0x7FFF);
|
|
73
|
+
const int32 kint32min = (( int32) 0x80000000);
|
|
74
|
+
const int32 kint32max = (( int32) 0x7FFFFFFF);
|
|
75
|
+
const int64 kint64min = (( int64) GG_LONGLONG(0x8000000000000000));
|
|
76
|
+
const int64 kint64max = (( int64) GG_LONGLONG(0x7FFFFFFFFFFFFFFF));
|
|
77
|
+
|
|
78
|
+
// A macro to disallow the copy constructor and operator= functions
|
|
79
|
+
// This should be used in the private: declarations for a class
|
|
80
|
+
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
|
|
81
|
+
TypeName(const TypeName&); \
|
|
82
|
+
void operator=(const TypeName&)
|
|
83
|
+
|
|
84
|
+
// An older, deprecated, politically incorrect name for the above.
|
|
85
|
+
#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) DISALLOW_COPY_AND_ASSIGN(TypeName)
|
|
86
|
+
|
|
87
|
+
// A macro to disallow all the implicit constructors, namely the
|
|
88
|
+
// default constructor, copy constructor and operator= functions.
|
|
89
|
+
//
|
|
90
|
+
// This should be used in the private: declarations for a class
|
|
91
|
+
// that wants to prevent anyone from instantiating it. This is
|
|
92
|
+
// especially useful for classes containing only static methods.
|
|
93
|
+
#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
|
|
94
|
+
TypeName(); \
|
|
95
|
+
DISALLOW_COPY_AND_ASSIGN(TypeName)
|
|
96
|
+
|
|
97
|
+
// The arraysize(arr) macro returns the # of elements in an array arr.
|
|
98
|
+
// The expression is a compile-time constant, and therefore can be
|
|
99
|
+
// used in defining new arrays, for example. If you use arraysize on
|
|
100
|
+
// a pointer by mistake, you will get a compile-time error.
|
|
101
|
+
//
|
|
102
|
+
// One caveat is that arraysize() doesn't accept any array of an
|
|
103
|
+
// anonymous type or a type defined inside a function. In these rare
|
|
104
|
+
// cases, you have to use the unsafe ARRAYSIZE_UNSAFE() macro below. This is
|
|
105
|
+
// due to a limitation in C++'s template system. The limitation might
|
|
106
|
+
// eventually be removed, but it hasn't happened yet.
|
|
107
|
+
|
|
108
|
+
// This template function declaration is used in defining arraysize.
|
|
109
|
+
// Note that the function doesn't need an implementation, as we only
|
|
110
|
+
// use its type.
|
|
111
|
+
template <typename T, size_t N>
|
|
112
|
+
char (&ArraySizeHelper(T (&array)[N]))[N];  // declaration only: takes a reference to a T[N] and returns a reference to a char[N], so sizeof() of a call yields N
|
|
113
|
+
|
|
114
|
+
// That gcc wants both of these prototypes seems mysterious. VC, for
|
|
115
|
+
// its part, can't decide which to use (another mystery). Matching of
|
|
116
|
+
// template overloads: the final frontier.
|
|
117
|
+
#ifndef _MSC_VER
|
|
118
|
+
template <typename T, size_t N>
|
|
119
|
+
char (&ArraySizeHelper(const T (&array)[N]))[N];  // overload for arrays of const T; likewise declaration only, used solely for its return type
|
|
120
|
+
#endif
|
|
121
|
+
|
|
122
|
+
#define arraysize(array) (sizeof(ArraySizeHelper(array)))
|
|
123
|
+
|
|
124
|
+
// ARRAYSIZE_UNSAFE performs essentially the same calculation as arraysize,
|
|
125
|
+
// but can be used on anonymous types or types defined inside
|
|
126
|
+
// functions. It's less safe than arraysize as it accepts some
|
|
127
|
+
// (although not all) pointers. Therefore, you should use arraysize
|
|
128
|
+
// whenever possible.
|
|
129
|
+
//
|
|
130
|
+
// The expression ARRAYSIZE_UNSAFE(a) is a compile-time constant of type
|
|
131
|
+
// size_t.
|
|
132
|
+
//
|
|
133
|
+
// ARRAYSIZE_UNSAFE catches a few type errors. If you see a compiler error
|
|
134
|
+
//
|
|
135
|
+
// "warning: division by zero in ..."
|
|
136
|
+
//
|
|
137
|
+
// when using ARRAYSIZE_UNSAFE, you are (wrongfully) giving it a pointer.
|
|
138
|
+
// You should only use ARRAYSIZE_UNSAFE on statically allocated arrays.
|
|
139
|
+
//
|
|
140
|
+
// The following comments are on the implementation details, and can
|
|
141
|
+
// be ignored by the users.
|
|
142
|
+
//
|
|
143
|
+
// ARRAYSIZE_UNSAFE(arr) works by inspecting sizeof(arr) (the # of bytes in
|
|
144
|
+
// the array) and sizeof(*(arr)) (the # of bytes in one array
|
|
145
|
+
// element). If the former is divisible by the latter, perhaps arr is
|
|
146
|
+
// indeed an array, in which case the division result is the # of
|
|
147
|
+
// elements in the array. Otherwise, arr cannot possibly be an array,
|
|
148
|
+
// and we generate a compiler error to prevent the code from
|
|
149
|
+
// compiling.
|
|
150
|
+
//
|
|
151
|
+
// Since the size of bool is implementation-defined, we need to cast
|
|
152
|
+
// !(sizeof(a) % sizeof(*(a))) to size_t in order to ensure the final
|
|
153
|
+
// result has type size_t.
|
|
154
|
+
//
|
|
155
|
+
// This macro is not perfect as it wrongfully accepts certain
|
|
156
|
+
// pointers, namely where the pointer size is divisible by the pointee
|
|
157
|
+
// size. Since all our code has to go through a 32-bit compiler,
|
|
158
|
+
// where a pointer is 4 bytes, this means all pointers to a type whose
|
|
159
|
+
// size is 3 or greater than 4 will be (righteously) rejected.
|
|
160
|
+
|
|
161
|
+
#define ARRAYSIZE_UNSAFE(a) \
|
|
162
|
+
((sizeof(a) / sizeof(*(a))) / \
|
|
163
|
+
static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
// Use implicit_cast as a safe version of static_cast or const_cast
|
|
167
|
+
// for upcasting in the type hierarchy (i.e. casting a pointer to Foo
|
|
168
|
+
// to a pointer to SuperclassOfFoo or casting a pointer to Foo to
|
|
169
|
+
// a const pointer to Foo).
|
|
170
|
+
// When you use implicit_cast, the compiler checks that the cast is safe.
|
|
171
|
+
// Such explicit implicit_casts are necessary in surprisingly many
|
|
172
|
+
// situations where C++ demands an exact type match instead of an
|
|
173
|
+
// argument type convertible to a target type.
|
|
174
|
+
//
|
|
175
|
+
// The From type can be inferred, so the preferred syntax for using
|
|
176
|
+
// implicit_cast is the same as for static_cast etc.:
|
|
177
|
+
//
|
|
178
|
+
// implicit_cast<ToType>(expr)
|
|
179
|
+
//
|
|
180
|
+
// implicit_cast would have been part of the C++ standard library,
|
|
181
|
+
// but the proposal was submitted too late. It will probably make
|
|
182
|
+
// its way into the language in the future.
|
|
183
|
+
template<typename To, typename From>
|
|
184
|
+
inline To implicit_cast(From const &f) {  // To is supplied explicitly by the caller; From is deduced from the argument
|
|
185
|
+
return f;  // relies purely on the implicit From -> To conversion; no cast operator is applied here
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
// The COMPILE_ASSERT macro can be used to verify that a compile time
|
|
189
|
+
// expression is true. For example, you could use it to verify the
|
|
190
|
+
// size of a static array:
|
|
191
|
+
//
|
|
192
|
+
// COMPILE_ASSERT(ARRAYSIZE_UNSAFE(content_type_names) == CONTENT_NUM_TYPES,
|
|
193
|
+
// content_type_names_incorrect_size);
|
|
194
|
+
//
|
|
195
|
+
// or to make sure a struct is smaller than a certain size:
|
|
196
|
+
//
|
|
197
|
+
// COMPILE_ASSERT(sizeof(foo) < 128, foo_too_large);
|
|
198
|
+
//
|
|
199
|
+
// The second argument to the macro is the name of the variable. If
|
|
200
|
+
// the expression is false, most compilers will issue a warning/error
|
|
201
|
+
// containing the name of the variable.
|
|
202
|
+
|
|
203
|
+
template <bool>
|
|
204
|
+
struct CompileAssert {  // intentionally empty: only its bool template argument matters (used by COMPILE_ASSERT)
|
|
205
|
+
};
|
|
206
|
+
|
|
207
|
+
#undef COMPILE_ASSERT
|
|
208
|
+
#define COMPILE_ASSERT(expr, msg) \
|
|
209
|
+
typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
|
|
210
|
+
|
|
211
|
+
// Implementation details of COMPILE_ASSERT:
|
|
212
|
+
//
|
|
213
|
+
// - COMPILE_ASSERT works by defining an array type that has -1
|
|
214
|
+
// elements (and thus is invalid) when the expression is false.
|
|
215
|
+
//
|
|
216
|
+
// - The simpler definition
|
|
217
|
+
//
|
|
218
|
+
// #define COMPILE_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1]
|
|
219
|
+
//
|
|
220
|
+
// does not work, as gcc supports variable-length arrays whose sizes
|
|
221
|
+
// are determined at run-time (this is gcc's extension and not part
|
|
222
|
+
// of the C++ standard). As a result, gcc fails to reject the
|
|
223
|
+
// following code with the simple definition:
|
|
224
|
+
//
|
|
225
|
+
// int foo;
|
|
226
|
+
// COMPILE_ASSERT(foo, msg); // not supposed to compile as foo is
|
|
227
|
+
// // not a compile-time constant.
|
|
228
|
+
//
|
|
229
|
+
// - By using the type CompileAssert<(bool(expr))>, we ensure that
|
|
230
|
+
// expr is a compile-time constant. (Template arguments must be
|
|
231
|
+
// determined at compile-time.)
|
|
232
|
+
//
|
|
233
|
+
// - The outer parentheses in CompileAssert<(bool(expr))> are necessary
|
|
234
|
+
// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written
|
|
235
|
+
//
|
|
236
|
+
// CompileAssert<bool(expr)>
|
|
237
|
+
//
|
|
238
|
+
// instead, these compilers will refuse to compile
|
|
239
|
+
//
|
|
240
|
+
// COMPILE_ASSERT(5 > 0, some_message);
|
|
241
|
+
//
|
|
242
|
+
// (They seem to think the ">" in "5 > 0" marks the end of the
|
|
243
|
+
// template argument list.)
|
|
244
|
+
//
|
|
245
|
+
// - The array size is (bool(expr) ? 1 : -1), instead of simply
|
|
246
|
+
//
|
|
247
|
+
// ((expr) ? 1 : -1).
|
|
248
|
+
//
|
|
249
|
+
// This is to avoid running into a bug in MS VC 7.1, which
|
|
250
|
+
// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
// MetatagId refers to metatag-id that we assign to
|
|
254
|
+
// each metatag <name, value> pair.
|
|
255
|
+
typedef uint32 MetatagId;
|
|
256
|
+
|
|
257
|
+
// Argument type used in interfaces that can optionally take ownership
|
|
258
|
+
// of a passed in argument. If TAKE_OWNERSHIP is passed, the called
|
|
259
|
+
// object takes ownership of the argument. Otherwise it does not.
|
|
260
|
+
enum Ownership {
|
|
261
|
+
DO_NOT_TAKE_OWNERSHIP,  // the called object does not take ownership of the passed-in argument
|
|
262
|
+
TAKE_OWNERSHIP  // the called object takes ownership of the passed-in argument
|
|
263
|
+
};
|
|
264
|
+
|
|
265
|
+
// bit_cast<Dest,Source> is a template function that implements the
|
|
266
|
+
// equivalent of "*reinterpret_cast<Dest*>(&source)". We need this in
|
|
267
|
+
// very low-level functions like the protobuf library and fast math
|
|
268
|
+
// support.
|
|
269
|
+
//
|
|
270
|
+
// float f = 3.14159265358979;
|
|
271
|
+
// int i = bit_cast<int32>(f);
|
|
272
|
+
// // i = 0x40490fdb
|
|
273
|
+
//
|
|
274
|
+
// The classical address-casting method is:
|
|
275
|
+
//
|
|
276
|
+
// // WRONG
|
|
277
|
+
// float f = 3.14159265358979; // WRONG
|
|
278
|
+
// int i = * reinterpret_cast<int*>(&f); // WRONG
|
|
279
|
+
//
|
|
280
|
+
// The address-casting method actually produces undefined behavior
|
|
281
|
+
// according to ISO C++ specification section 3.10 -15-. Roughly, this
|
|
282
|
+
// section says: if an object in memory has one type, and a program
|
|
283
|
+
// accesses it with a different type, then the result is undefined
|
|
284
|
+
// behavior for most values of "different type".
|
|
285
|
+
//
|
|
286
|
+
// This is true for any cast syntax, either *(int*)&f or
|
|
287
|
+
// *reinterpret_cast<int*>(&f). And it is particularly true for
|
|
288
|
+
// conversions between integral lvalues and floating-point lvalues.
|
|
289
|
+
//
|
|
290
|
+
// The purpose of 3.10 -15- is to allow optimizing compilers to assume
|
|
291
|
+
// that expressions with different types refer to different memory. gcc
|
|
292
|
+
// 4.0.1 has an optimizer that takes advantage of this. So a
|
|
293
|
+
// non-conforming program quietly produces wildly incorrect output.
|
|
294
|
+
//
|
|
295
|
+
// The problem is not the use of reinterpret_cast. The problem is type
|
|
296
|
+
// punning: holding an object in memory of one type and reading its bits
|
|
297
|
+
// back using a different type.
|
|
298
|
+
//
|
|
299
|
+
// The C++ standard is more subtle and complex than this, but that
|
|
300
|
+
// is the basic idea.
|
|
301
|
+
//
|
|
302
|
+
// Anyways ...
|
|
303
|
+
//
|
|
304
|
+
// bit_cast<> calls memcpy() which is blessed by the standard,
|
|
305
|
+
// especially by the example in section 3.9 . Also, of course,
|
|
306
|
+
// bit_cast<> wraps up the nasty logic in one place.
|
|
307
|
+
//
|
|
308
|
+
// Fortunately memcpy() is very fast. In optimized mode, with a
|
|
309
|
+
// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
|
|
310
|
+
// code with the minimal amount of data movement. On a 32-bit system,
|
|
311
|
+
// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
|
|
312
|
+
// compiles to two loads and two stores.
|
|
313
|
+
//
|
|
314
|
+
// I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1.
|
|
315
|
+
//
|
|
316
|
+
// WARNING: if Dest or Source is a non-POD type, the result of the memcpy
|
|
317
|
+
// is likely to surprise you.
|
|
318
|
+
|
|
319
|
+
template <class Dest, class Source>
|
|
320
|
+
inline Dest bit_cast(const Source& source) {  // returns a Dest whose bytes are a copy of source's bytes
|
|
321
|
+
// Compile-time assertion that sizeof(Dest) == sizeof(Source).
|
|
322
|
+
// A compile error on the typedef below means Dest and Source have different sizes.
|
|
323
|
+
typedef char VerifySizesAreEqual [sizeof(Dest) == sizeof(Source) ? 1 : -1];  // array bound is -1 (ill-formed) when the sizes differ
|
|
324
|
+
|
|
325
|
+
Dest dest;  // default-initialized; its bytes are overwritten by the memcpy below
|
|
326
|
+
memcpy(&dest, &source, sizeof(dest));  // byte-wise copy instead of a pointer cast, avoiding type punning through pointers
|
|
327
|
+
return dest;
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
// The following enum should be used only as a constructor argument to indicate
|
|
331
|
+
// that the variable has static storage class, and that the constructor should
|
|
332
|
+
// do nothing to its state. It indicates to the reader that it is legal to
|
|
333
|
+
// declare a static instance of the class, provided the constructor is given
|
|
334
|
+
// the base::LINKER_INITIALIZED argument. Normally, it is unsafe to declare a
|
|
335
|
+
// static variable that has a constructor or a destructor because invocation
|
|
336
|
+
// order is undefined. However, IF the type can be initialized by filling with
|
|
337
|
+
// zeroes (which the loader does for static variables), AND the destructor also
|
|
338
|
+
// does nothing to the storage, AND there are no virtual methods, then a
|
|
339
|
+
// constructor declared as
|
|
340
|
+
// explicit MyClass(base::LinkerInitialized x) {}
|
|
341
|
+
// and invoked as
|
|
342
|
+
// static MyClass my_variable_name(base::LINKER_INITIALIZED);
|
|
343
|
+
namespace base {
|
|
344
|
+
enum LinkerInitialized { LINKER_INITIALIZED };  // single-value tag type, meant to be passed only as a constructor argument
|
|
345
|
+
} // namespace base
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
#endif // BASE_BASICTYPES_H_
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// This file adds defines about the platform we're currently building on.
|
|
6
|
+
// Operating System:
|
|
7
|
+
// OS_WIN / OS_MACOSX / OS_LINUX / OS_FREEBSD / OS_NETBSD / OS_OPENBSD / OS_DRAGONFLY / OS_SOLARIS / OS_POSIX (any of these except OS_WIN)
|
|
8
|
+
// Compiler:
|
|
9
|
+
// COMPILER_MSVC / COMPILER_GCC
|
|
10
|
+
// Processor:
|
|
11
|
+
// ARCH_CPU_X86 / ARCH_CPU_X86_64 / ARCH_CPU_X86_FAMILY (X86 or X86_64)
|
|
12
|
+
// ARCH_CPU_32_BITS / ARCH_CPU_64_BITS
|
|
13
|
+
|
|
14
|
+
#ifndef BUILD_BUILD_CONFIG_H_
|
|
15
|
+
#define BUILD_BUILD_CONFIG_H_
|
|
16
|
+
|
|
17
|
+
// A set of macros to use for platform detection.
|
|
18
|
+
#if defined(__APPLE__)
|
|
19
|
+
#define OS_MACOSX 1
|
|
20
|
+
#elif defined(__linux__)
|
|
21
|
+
#define OS_LINUX 1
|
|
22
|
+
// Use TOOLKIT_GTK on linux if TOOLKIT_VIEWS isn't defined.
|
|
23
|
+
#if !defined(TOOLKIT_VIEWS)
|
|
24
|
+
#define TOOLKIT_GTK
|
|
25
|
+
#endif
|
|
26
|
+
#elif defined(_WIN32)
|
|
27
|
+
#define OS_WIN 1
|
|
28
|
+
#define TOOLKIT_VIEWS 1
|
|
29
|
+
#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
|
|
30
|
+
#define OS_FREEBSD 1
|
|
31
|
+
#define TOOLKIT_GTK
|
|
32
|
+
#elif defined(__NetBSD__)
|
|
33
|
+
#define OS_NETBSD 1
|
|
34
|
+
#define TOOLKIT_GTK
|
|
35
|
+
#elif defined(__OpenBSD__)
|
|
36
|
+
#define OS_OPENBSD 1
|
|
37
|
+
#define TOOLKIT_GTK
|
|
38
|
+
#elif defined(__DragonFly__)
|
|
39
|
+
#define OS_DRAGONFLY 1
|
|
40
|
+
#define TOOLKIT_GTK
|
|
41
|
+
#elif defined(__sun)
|
|
42
|
+
#define OS_SOLARIS 1
|
|
43
|
+
#define TOOLKIT_GTK
|
|
44
|
+
#else
|
|
45
|
+
#error Please add support for your platform in build/build_config.h
|
|
46
|
+
#endif
|
|
47
|
+
|
|
48
|
+
// A flag derived from the above flags, used to cover GTK code in
|
|
49
|
+
// both TOOLKIT_GTK and TOOLKIT_VIEWS.
|
|
50
|
+
#if defined(TOOLKIT_GTK) || (defined(TOOLKIT_VIEWS) && !defined(OS_WIN))
|
|
51
|
+
#define TOOLKIT_USES_GTK 1
|
|
52
|
+
#endif
|
|
53
|
+
|
|
54
|
+
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
|
55
|
+
#define USE_NSS 1 // Use NSS for crypto.
|
|
56
|
+
#define USE_X11 1 // Use X for graphics.
|
|
57
|
+
#endif
|
|
58
|
+
|
|
59
|
+
// For access to standard POSIXish features, use OS_POSIX instead of a
|
|
60
|
+
// more specific macro.
|
|
61
|
+
#if defined(OS_MACOSX) || defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_SOLARIS) || defined(OS_DRAGONFLY)
|
|
62
|
+
#define OS_POSIX 1
|
|
63
|
+
// Use base::DataPack for name/value pairs.
|
|
64
|
+
#define USE_BASE_DATA_PACK 1
|
|
65
|
+
#endif
|
|
66
|
+
|
|
67
|
+
// Use tcmalloc
|
|
68
|
+
#if defined(OS_WIN) && ! defined(NO_TCMALLOC)
|
|
69
|
+
#define USE_TCMALLOC 1
|
|
70
|
+
#endif
|
|
71
|
+
|
|
72
|
+
// Compiler detection.
|
|
73
|
+
#if defined(__GNUC__)
|
|
74
|
+
#define COMPILER_GCC 1
|
|
75
|
+
#elif defined(_MSC_VER)
|
|
76
|
+
#define COMPILER_MSVC 1
|
|
77
|
+
#else
|
|
78
|
+
#error Please add support for your compiler in build/build_config.h
|
|
79
|
+
#endif
|
|
80
|
+
|
|
81
|
+
// Processor architecture detection. For more info on what's defined, see:
|
|
82
|
+
// http://msdn.microsoft.com/en-us/library/b0084kay.aspx
|
|
83
|
+
// http://www.agner.org/optimize/calling_conventions.pdf
|
|
84
|
+
// or with gcc, run: "echo | gcc -E -dM -"
|
|
85
|
+
#if defined(_M_X64) || defined(__x86_64__)
|
|
86
|
+
#define ARCH_CPU_X86_FAMILY 1
|
|
87
|
+
#define ARCH_CPU_X86_64 1
|
|
88
|
+
#define ARCH_CPU_64_BITS 1
|
|
89
|
+
#elif defined(_M_IX86) || defined(__i386__)
|
|
90
|
+
#define ARCH_CPU_X86_FAMILY 1
|
|
91
|
+
#define ARCH_CPU_X86 1
|
|
92
|
+
#define ARCH_CPU_32_BITS 1
|
|
93
|
+
#elif defined(__ARMEL__)
|
|
94
|
+
#define ARCH_CPU_ARM_FAMILY 1
|
|
95
|
+
#define ARCH_CPU_ARMEL 1
|
|
96
|
+
#define ARCH_CPU_32_BITS 1
|
|
97
|
+
#define WCHAR_T_IS_UNSIGNED 1
|
|
98
|
+
#elif defined(__ARCH_PPC) || defined(__ppc__)
|
|
99
|
+
#define ARCH_CPU_PPC_FAMILY 1
|
|
100
|
+
#define ARCH_CPU_32_BITS 1
|
|
101
|
+
#else
|
|
102
|
+
#error Please add support for your architecture in build/build_config.h
|
|
103
|
+
#endif
|
|
104
|
+
|
|
105
|
+
// Type detection for wchar_t.
|
|
106
|
+
#if defined(OS_WIN)
|
|
107
|
+
#define WCHAR_T_IS_UTF16
|
|
108
|
+
#elif defined(OS_POSIX) && defined(COMPILER_GCC) && \
|
|
109
|
+
defined(__WCHAR_MAX__) && \
|
|
110
|
+
(__WCHAR_MAX__ == 0x7fffffff || __WCHAR_MAX__ == 0xffffffff)
|
|
111
|
+
#define WCHAR_T_IS_UTF32
|
|
112
|
+
#elif defined(OS_POSIX) && defined(COMPILER_GCC) && \
|
|
113
|
+
defined(__WCHAR_MAX__) && \
|
|
114
|
+
(__WCHAR_MAX__ == 0x7fff || __WCHAR_MAX__ == 0xffff)
|
|
115
|
+
// On Posix, we'll detect short wchar_t, but projects aren't guaranteed to
|
|
116
|
+
// compile in this mode (in particular, Chrome doesn't). This is intended for
|
|
117
|
+
// other projects using base who manage their own dependencies and make sure
|
|
118
|
+
// short wchar works for them.
|
|
119
|
+
#define WCHAR_T_IS_UTF16
|
|
120
|
+
#else
|
|
121
|
+
#error Please add support for your compiler in build/build_config.h
|
|
122
|
+
#endif
|
|
123
|
+
|
|
124
|
+
#endif // BUILD_BUILD_CONFIG_H_
|