icu 0.9.1 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.travis.yml +11 -0
- data/Gemfile +3 -0
- data/LICENSE +20 -0
- data/README.md +69 -0
- data/Rakefile +38 -0
- data/benchmark/normalization.rb +106 -0
- data/benchmark/normalization_phrases.txt +1031 -0
- data/benchmark/normalization_result.txt +45 -0
- data/benchmark/normalization_wikip.txt +2838 -0
- data/ext/icu/extconf.rb +242 -0
- data/ext/icu/icu.c +18 -0
- data/ext/icu/icu.h +78 -0
- data/ext/icu/icu_charset_detector.c +192 -0
- data/ext/icu/icu_collator.c +138 -0
- data/ext/icu/icu_locale.c +852 -0
- data/ext/icu/icu_normalizer.c +122 -0
- data/ext/icu/icu_number_format.c +0 -0
- data/ext/icu/icu_spoof_checker.c +194 -0
- data/ext/icu/icu_transliterator.c +159 -0
- data/ext/icu/internal_encoding.c +38 -0
- data/ext/icu/internal_ustring.c +304 -0
- data/ext/icu/internal_utils.c +50 -0
- data/ext/icu/rb_errors.c +14 -0
- data/icu.gemspec +22 -0
- data/lib/icu.rb +6 -18
- data/lib/icu/charset_detector.rb +5 -0
- data/lib/icu/collator.rb +24 -0
- data/lib/icu/locale.rb +19 -0
- data/lib/icu/transliterator.rb +8 -0
- data/lib/icu/version.rb +3 -0
- data/spec/charset_detector_spec.rb +47 -0
- data/spec/collator_spec.rb +73 -0
- data/spec/locale_spec.rb +312 -0
- data/spec/normalizer_spec.rb +35 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/spoof_checker_spec.rb +56 -0
- data/spec/transliterator_spec.rb +41 -0
- metadata +132 -55
- data/COPYING +0 -674
- data/COPYING.LESSER +0 -165
- data/README +0 -81
- data/ext/extconf.rb +0 -31
- data/ext/icu.c +0 -128
- data/ext/icu.h +0 -34
- data/ext/icu_locale.c +0 -330
- data/ext/icu_locale_country.c +0 -99
- data/ext/icu_locale_language.c +0 -99
- data/ext/icu_numeric.c +0 -161
- data/ext/icu_time.c +0 -391
- data/test/test_locale.rb +0 -73
- data/test/test_numeric.rb +0 -78
- data/test/test_time.rb +0 -75
data/ext/icu/extconf.rb
ADDED
@@ -0,0 +1,242 @@
|
|
1
|
+
require 'mkmf'
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
|
5
|
+
ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..', '..'))
|
6
|
+
|
7
|
+
# Utility functions
|
8
|
+
|
9
|
+
# Reports whether the extension should be linked against a system-wide ICU.
#
# The `--use-system-libraries` configure argument is looked up through
# `arg_config`; the default handed to it is true whenever the
# ICU_USE_SYSTEM_LIBRARIES environment variable is set (to any value,
# including an empty string) and false otherwise.
def using_system_libraries?
  env_default = !ENV['ICU_USE_SYSTEM_LIBRARIES'].nil?
  arg_config('--use-system-libraries', env_default)
end
|
12
|
+
|
13
|
+
# Building with system ICU
|
14
|
+
|
15
|
+
if using_system_libraries?
|
16
|
+
message "Building ICU using system libraries.\n Not supported yet, PR welcome!"
|
17
|
+
exit 1
|
18
|
+
|
19
|
+
unless dir_config('icu').any?
|
20
|
+
base = if !`which brew`.empty?
|
21
|
+
`brew --prefix`.strip
|
22
|
+
elsif File.exists?("/usr/local/Cellar/icu4c")
|
23
|
+
'/usr/local/Cellar'
|
24
|
+
end
|
25
|
+
|
26
|
+
if base and icu4c = Dir[File.join(base, 'Cellar/icu4c/*')].sort.last
|
27
|
+
$INCFLAGS << " -I#{icu4c}/include "
|
28
|
+
$LDFLAGS << " -L#{icu4c}/lib "
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
unless have_library 'icui18n' and have_header 'unicode/ucnv.h'
|
33
|
+
STDERR.puts <<-EOS
|
34
|
+
************************************************************************
|
35
|
+
icu not found.
|
36
|
+
install by brew install icu4c or apt-get install libicu-dev)
|
37
|
+
************************************************************************ww
|
38
|
+
EOS
|
39
|
+
|
40
|
+
exit(1)
|
41
|
+
end
|
42
|
+
|
43
|
+
have_library 'icuuc' or abort 'libicuuc missing'
|
44
|
+
have_library 'icudata' or abort 'libicudata missing'
|
45
|
+
else
|
46
|
+
message "Building ICU from source.\n"
|
47
|
+
|
48
|
+
# The gem version constraint in the Rakefile is not respected at install time.
|
49
|
+
# Keep this version in sync with the one in the Rakefile !
|
50
|
+
require 'rubygems'
|
51
|
+
gem 'mini_portile2', '~> 2.2.0'
|
52
|
+
require 'mini_portile2'
|
53
|
+
|
54
|
+
# Checkout the source code of ICU.
|
55
|
+
# http://site.icu-project.org/download/
|
56
|
+
# http://userguide.icu-project.org/howtouseicu
|
57
|
+
# Also check the readme of ICU release file.
|
58
|
+
class ICURecipe < MiniPortile
|
59
|
+
# Configures a MiniPortile recipe for building the bundled ICU sources.
#
# name     - recipe name, forwarded to MiniPortile#initialize; it also selects
#            the patches/<name>/*.patch directory under ROOT.
# version  - recipe version, forwarded to MiniPortile#initialize.
# static_p - when truthy, static libraries are built in addition to shared
#            ones and the PIC / static-implementation flags are added;
#            otherwise only shared libraries are built.
#
# Yields the recipe so the caller can set its files and extra configure
# options. A block is mandatory because the `yield` is unconditional.
# The resulting build environment is stored in @env.
def initialize(name, version, static_p)
  super(name, version)
  self.target = File.join(ROOT, "ports")
  # Prefer host_alias over host in order to use i586-mingw32msvc as
  # correct compiler prefix for cross build, but use host if not set.
  self.host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
  self.patch_files = Dir[File.join(ROOT, "patches", name, "*.patch")].sort
  self.configure_options << "--libdir=#{File.join(self.path, "lib")}"

  yield self

  # Build environment. A variable that was not set explicitly is seeded on
  # first access from the process environment ('' when it is unset there).
  env = Hash.new do |hash, key|
    hash[key] = ENV[key].dup.to_s rescue ''
  end

  self.configure_options.flatten!

  # Options of the form NAME=value are environment assignments, not configure
  # switches: move them out of configure_options and into env.
  self.configure_options.delete_if do |option|
    case option
    when /\A(\w+)=(.*)\z/
      env[$1] = $2
      true
    else
      false
    end
  end

  if static_p
    self.configure_options += [
      "--enable-shared",
      "--enable-static",
      "--disable-renaming"
    ]
    env['CFLAGS'] = "-fPIC #{env['CFLAGS']}"
    env['CPPFLAGS'] = "-DU_CHARSET_IS_UTF8=1 -DU_USING_ICU_NAMESPACE=0 -DU_STATIC_IMPLEMENTATION #{env['CPPFLAGS']}"
    env['CXXFLAGS'] = "-fPIC -fno-exceptions #{env['CXXFLAGS']}"
    # Extend the existing LDFLAGS. The previous code interpolated CFLAGS here,
    # which discarded caller-supplied linker flags and put compiler flags on
    # the link line instead.
    env['LDFLAGS'] = "-fPIC -static-libstdc++ #{env['LDFLAGS']}"
  else
    self.configure_options += [
      "--enable-shared",
      "--disable-static",
    ]
  end

  # Universal builds need the architecture flags on both the compile and the
  # link line unless the caller already supplied an -arch option.
  if RbConfig::CONFIG['target_cpu'] == 'universal'
    %w[CFLAGS LDFLAGS].each do |key|
      unless env[key].include?('-arch')
        env[key] += ' ' + RbConfig::CONFIG['ARCH_FLAG']
      end
    end
  end

  @env = env
end
|
113
|
+
|
114
|
+
def cook
|
115
|
+
|
116
|
+
message <<-"EOS"
|
117
|
+
************************************************************************
|
118
|
+
IMPORTANT NOTICE:
|
119
|
+
|
120
|
+
Building ICU with a packaged version of #{name}-#{version}#{'.' if self.patch_files.empty?}
|
121
|
+
EOS
|
122
|
+
|
123
|
+
unless self.patch_files.empty?
|
124
|
+
message "with the following patches applied:\n"
|
125
|
+
|
126
|
+
self.patch_files.each do |patch|
|
127
|
+
message "\t- %s\n" % File.basename(patch)
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
message <<-"EOS"
|
132
|
+
|
133
|
+
gem install icu -- --use-system-libraries
|
134
|
+
|
135
|
+
If you are using Bundler, tell it to use the option:
|
136
|
+
|
137
|
+
bundle config build.icu --use-system-libraries
|
138
|
+
bundle install
|
139
|
+
EOS
|
140
|
+
|
141
|
+
message <<-"EOS"
|
142
|
+
************************************************************************
|
143
|
+
EOS
|
144
|
+
super
|
145
|
+
end
|
146
|
+
|
147
|
+
# Runs ICU's runConfigureICU wrapper for the current platform, then hands
# over to the superclass's configure step.
def configure
  # runConfigureICU is the recommended entry point: given a platform name it
  # sets up the compiler and flags.
  platform =
    case RUBY_PLATFORM
    when /mingw|mswin/ then 'MSYS/MSVC'
    when /darwin/ then 'MacOSX'
    else 'Linux'
    end
  command = [@env, './runConfigureICU', platform] + computed_options
  execute('ICU Configure', command)
  super
end
|
159
|
+
|
160
|
+
# Directory in which the build commands run: the `source` subdirectory of the
# first directory found directly under tmp_path.
def work_path
  extracted_dir = Dir.glob("#{tmp_path}/*").find { |entry| File.directory?(entry) }
  File.join(extracted_dir, 'source')
end
|
163
|
+
|
164
|
+
end
|
165
|
+
|
166
|
+
message "Using mini_portile version #{MiniPortile::VERSION}\n"
|
167
|
+
|
168
|
+
static_p = enable_config('static', true) or
|
169
|
+
message "Static linking is disabled.\n"
|
170
|
+
recipes = []
|
171
|
+
|
172
|
+
libicu_recipe = ICURecipe.new("libicu", "59.1", static_p) do |recipe|
|
173
|
+
recipe.files = [{
|
174
|
+
url: "https://downloads.sourceforge.net/project/icu/ICU4C/59.1/icu4c-59_1-src.tgz?r=&ts=1501595646",
|
175
|
+
sha256: "7132fdaf9379429d004005217f10e00b7d2319d0fea22bdfddef8991c45b75fe"
|
176
|
+
# gpg: Signature made Fri Apr 14 21:00:23 2017 CEST using RSA key ID 4FB419E3
|
177
|
+
# gpg: requesting key 4FB419E3 from hkps server hkps.pool.sks-keyservers.net
|
178
|
+
# gpg: key 4FB419E3: public key "Steven R. Loomis (filfla-signing) <srloomis@us.ibm.com>" imported
|
179
|
+
# gpg: 3 marginal(s) needed, 1 complete(s) needed, PGP trust model
|
180
|
+
# gpg: depth: 0 valid: 2 signed: 1 trust: 0-, 0q, 0n, 0m, 0f, 2u
|
181
|
+
# gpg: depth: 1 valid: 1 signed: 0 trust: 1-, 0q, 0n, 0m, 0f, 0u
|
182
|
+
# gpg: next trustdb check due at 2018-08-19
|
183
|
+
# gpg: Total number processed: 1
|
184
|
+
# gpg: imported: 1 (RSA: 1)
|
185
|
+
# gpg: Good signature from "Steven R. Loomis (filfla-signing) <srloomis@us.ibm.com>" [unknown]
|
186
|
+
# gpg: aka "Steven R. Loomis (filfla-signing) <srl295@gmail.com>" [unknown]
|
187
|
+
# gpg: aka "Steven R. Loomis (filfla-signing) <srl@icu-project.org>" [unknown]
|
188
|
+
# gpg: aka "[jpeg image of size 4680]" [unknown]
|
189
|
+
# gpg: WARNING: This key is not certified with a trusted signature!
|
190
|
+
# gpg: There is no indication that the signature belongs to the owner.
|
191
|
+
# Primary key fingerprint: BA90 283A 60D6 7BA0 DD91 0A89 3932 080F 4FB4 19E3
|
192
|
+
}]
|
193
|
+
end
|
194
|
+
recipes.push libicu_recipe
|
195
|
+
|
196
|
+
recipes.each do |recipe|
|
197
|
+
checkpoint = "#{recipe.target}/#{recipe.name}-#{recipe.version}-#{recipe.host}.installed"
|
198
|
+
unless File.exist?(checkpoint)
|
199
|
+
recipe.cook
|
200
|
+
FileUtils.touch checkpoint
|
201
|
+
end
|
202
|
+
|
203
|
+
recipe.activate
|
204
|
+
end
|
205
|
+
|
206
|
+
# Fold the settings reported by the bundled ICU's config script into the
# mkmf globals ($LIBPATH, $LDFLAGS, $INCFLAGS, $CPPFLAGS, $CFLAGS) and prepend
# the reported -l options to $libs.
$libs = $libs.shellsplit.tap do |libs|
  [libicu_recipe].each do |recipe|
    libname = recipe.name[/\Alib(.+)\z/, 1]
    # TODO: build with pkg-config
    # Should do like PKG_CONFIG_PATH=/root/icu4r/ports/x86_64-pc-linux-gnu/libicu/59.1/lib/pkgconfig/ pkg-config --static icu-uc
    File.join(recipe.path, "bin", "#{libname}-config").tap do |config|
      # Call the config script explicitly through 'sh' for compatibility with Windows.
      $CPPFLAGS = '-DU_DISABLE_RENAMING=1 -DU_CHARSET_IS_UTF8=1 -DU_USING_ICU_NAMESPACE=0 -DU_STATIC_IMPLEMENTATION' << ' ' << $CPPFLAGS
      `sh #{config} --ldflags`.strip.shellsplit.each do |arg|
        case arg
        when /\A-L(.+)\z/
          # Prioritize ports' directories
          if $1.start_with?(ROOT + '/')
            $LIBPATH = [$1] | $LIBPATH
          else
            $LIBPATH = $LIBPATH | [$1]
          end
        when /\A-l./
          libs.unshift(arg)
        else
          $LDFLAGS << ' ' << arg.shellescape
        end
      end
      $INCFLAGS = `sh #{config} --cppflags-searchpath `.strip << ' ' << $INCFLAGS
      # A separator is required before the existing $CFLAGS: the script output
      # has been stripped, so without it the last reported flag and the first
      # existing flag would fuse into a single bogus token.
      $CFLAGS = '-DU_DISABLE_RENAMING=1 -DU_CHARSET_IS_UTF8=1 -DU_USING_ICU_NAMESPACE=0 -DU_STATIC_IMPLEMENTATION' << ' ' << `sh #{config} --cflags`.strip << ' ' << $CFLAGS
    end
  end
end.shelljoin
|
234
|
+
|
235
|
+
end
|
236
|
+
|
237
|
+
$CFLAGS << ' -O3 -funroll-loops -std=c99'
|
238
|
+
$CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']
|
239
|
+
|
240
|
+
puts $CFLAGS, $CPPFLAGS, $CXXFLAGS
|
241
|
+
|
242
|
+
create_makefile('icu/icu')
|
data/ext/icu/icu.c
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#include "icu.h"
|
2
|
+
|
3
|
+
VALUE rb_mICU;
|
4
|
+
|
5
|
+
/*
 * Extension entry point. Defines the top-level ICU module and then runs the
 * init routine of every component. rb_mICU is assigned first, before any of
 * the component initialisers is called.
 */
void Init_icu(void)
{
    rb_mICU = rb_define_module("ICU");
    init_internal_encoding();
    init_rb_errors();
    init_icu_collator();
    init_icu_normalizer();
    init_icu_spoof_checker();
    init_icu_transliterator();
    init_icu_charset_detector();
    init_icu_locale();
}
|
17
|
+
|
18
|
+
/* vim: set expandtab sws=4 sw=4: */
|
data/ext/icu/icu.h
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
#ifndef RUBY_EXTENSION_ICU_H_
|
2
|
+
#define RUBY_EXTENSION_ICU_H_
|
3
|
+
|
4
|
+
/* System libraries */
|
5
|
+
#include <stdlib.h>
|
6
|
+
|
7
|
+
/* Ruby headers */
|
8
|
+
#define ONIG_ESCAPE_UCHAR_COLLISION 1 // ruby.h defines UChar macro
|
9
|
+
#include <ruby.h>
|
10
|
+
#include <ruby/encoding.h>
|
11
|
+
#ifdef UChar // fail-safe
|
12
|
+
#undef UChar
|
13
|
+
#endif
|
14
|
+
#include "unicode/ustring.h"
|
15
|
+
#include "unicode/uenum.h"
|
16
|
+
#include "unicode/parseerr.h"
|
17
|
+
|
18
|
+
/* Globals */
|
19
|
+
|
20
|
+
extern VALUE rb_mICU;
|
21
|
+
extern VALUE rb_eICU_Error;
|
22
|
+
extern VALUE rb_eICU_InvalidParameterError;
|
23
|
+
extern VALUE rb_cICU_UString;
|
24
|
+
extern VALUE rb_cICU_Collator;
|
25
|
+
extern VALUE rb_cICU_Normalizer;
|
26
|
+
extern VALUE rb_cICU_SpoofChecker;
|
27
|
+
extern VALUE rb_cICU_Transliterator;
|
28
|
+
extern VALUE rb_cICU_CharsetDetector;
|
29
|
+
extern VALUE rb_cICU_CharsetDetector_Match;
|
30
|
+
extern VALUE rb_cICU_Locale;
|
31
|
+
|
32
|
+
/* Prototypes */
|
33
|
+
void Init_icu _(( void ));
|
34
|
+
void init_internal_encoding _(( void ));
|
35
|
+
void init_rb_errors _(( void ));
|
36
|
+
void init_icu_collator _(( void ));
|
37
|
+
void init_icu_normalizer _(( void ));
|
38
|
+
void init_icu_spoof_checker _(( void ));
|
39
|
+
void init_icu_transliterator _(( void ));
|
40
|
+
void init_icu_charset_detector _(( void ));
|
41
|
+
void init_icu_locale _(( void ));
|
42
|
+
|
43
|
+
int icu_is_rb_enc_idx_as_utf_8 _(( int ));
|
44
|
+
int icu_is_rb_str_as_utf_8 _(( VALUE ));
|
45
|
+
const char* icu_rb_str_enc_name _(( int ));
|
46
|
+
VALUE rb_str_enc_to_ascii_as_utf8 _(( VALUE ));
|
47
|
+
int icu_rb_str_enc_idx _(( VALUE ));
|
48
|
+
VALUE icu_enum_to_rb_ary _(( UEnumeration*, UErrorCode, long ));
|
49
|
+
extern void icu_rb_raise_icu_error _(( UErrorCode ));
|
50
|
+
extern void icu_rb_raise_icu_parse_error _(( const UParseError* ));
|
51
|
+
extern void icu_rb_raise_icu_invalid_parameter _(( const char*, const char* ));
|
52
|
+
|
53
|
+
VALUE icu_ustring_init_with_capa_enc _(( int32_t, int ));
|
54
|
+
VALUE icu_ustring_from_rb_str _(( VALUE ));
|
55
|
+
VALUE icu_ustring_from_uchar_str _(( const UChar*, int32_t ));
|
56
|
+
void icu_ustring_clear_ptr _(( VALUE ));
|
57
|
+
void icu_ustring_resize _(( VALUE, int32_t ));
|
58
|
+
void icu_ustring_set_enc _(( VALUE, int ));
|
59
|
+
VALUE icu_ustring_to_rb_enc_str_with_len _(( VALUE, int32_t ));
|
60
|
+
VALUE icu_ustring_to_rb_enc_str _(( VALUE ));
|
61
|
+
UChar* icu_ustring_ptr _(( VALUE ));
|
62
|
+
int32_t icu_ustring_len _(( VALUE ));
|
63
|
+
int32_t icu_ustring_capa _(( VALUE ));
|
64
|
+
VALUE char_buffer_to_rb_str _(( const char* ));
|
65
|
+
char* char_buffer_new _(( int32_t ));
|
66
|
+
void char_buffer_resize _(( const char*, int32_t ));
|
67
|
+
void char_buffer_free _(( const char* ));
|
68
|
+
|
69
|
+
/* Constants */
|
70
|
+
#define RUBY_C_STRING_TERMINATOR_SIZE 1
|
71
|
+
|
72
|
+
/* Macros */
|
73
|
+
/*
 * Encoding index to use: the index of Ruby's default internal encoding when
 * that index is non-zero, otherwise the index of the locale encoding.
 * (The previous definition joined the two calls with `||`, which in C yields
 * the truth value 0 or 1 rather than either index.)
 */
#define ICU_RUBY_ENCODING_INDEX \
    (rb_enc_to_index(rb_default_internal_encoding()) != 0 \
        ? rb_enc_to_index(rb_default_internal_encoding()) \
        : rb_locale_encindex())
|
74
|
+
/*
 * Name of the encoding with index _idx, or "" when no encoding has that index.
 * The whole expansion is parenthesised so the conditional cannot be re-parsed
 * against operators at the point of use.
 */
#define ICU_RB_STRING_ENC_NAME_IDX(_idx) (rb_enc_from_index(_idx) != NULL ? (rb_enc_from_index(_idx))->name : "")
|
75
|
+
|
76
|
+
#endif // RUBY_EXTENSION_ICU_H_
|
77
|
+
|
78
|
+
/* vim: set expandtab sws=4 sw=4: */
|
@@ -0,0 +1,192 @@
|
|
1
|
+
#include "icu.h"
|
2
|
+
#include "unicode/ucsdet.h"
|
3
|
+
|
4
|
+
/*
 * Declares a local `icu_detector_data*` named _data and fills it from the
 * VALUE called `self`, which must be in scope at the point of use.
 * Expands to a declaration followed by a statement, so it cannot be used as
 * the sole body of an unbraced if/else.
 */
#define GET_DETECTOR(_data) icu_detector_data* _data; \
                            TypedData_Get_Struct(self, icu_detector_data, &icu_detector_type, _data)
|
6
|
+
|
7
|
+
VALUE rb_cICU_CharsetDetector;
|
8
|
+
VALUE rb_cICU_CharsetDetector_Match;
|
9
|
+
|
10
|
+
/* Per-object state wrapped by each ICU::CharsetDetector instance. */
typedef struct {
    VALUE rb_instance;         /* set to the owning object in detector_initialize */
    UCharsetDetector* service; /* handle returned by ucsdet_open() */
    char* dummy_str; // used for reset
} icu_detector_data;
|
15
|
+
|
16
|
+
static void detector_free(void* _this)
|
17
|
+
{
|
18
|
+
icu_detector_data* this = _this;
|
19
|
+
if (this->dummy_str != NULL) {
|
20
|
+
ruby_xfree(this->dummy_str);
|
21
|
+
}
|
22
|
+
ucsdet_close(this->service);
|
23
|
+
}
|
24
|
+
|
25
|
+
/*
 * Size callback registered in icu_detector_type. Reports only the size of the
 * wrapper struct; the argument is ignored.
 */
static size_t detector_memsize(const void* _)
{
    return sizeof(icu_detector_data);
}
|
29
|
+
|
30
|
+
/*
 * Typed-data descriptor for icu_detector_data. No mark function is
 * registered; detector_free and detector_memsize are the free and size
 * callbacks.
 */
static const rb_data_type_t icu_detector_type = {
    "icu/charset_detector",
    {NULL, detector_free, detector_memsize,},
    0, 0,
    RUBY_TYPED_FREE_IMMEDIATELY,
};
|
36
|
+
|
37
|
+
static VALUE detector_populate_match_struct(const UCharsetMatch* match)
|
38
|
+
{
|
39
|
+
UErrorCode status = U_ZERO_ERROR;
|
40
|
+
int32_t confidence = ucsdet_getConfidence(match, &status);
|
41
|
+
if (U_FAILURE(status)) {
|
42
|
+
icu_rb_raise_icu_error(status);
|
43
|
+
}
|
44
|
+
status = U_ZERO_ERROR;
|
45
|
+
const char* name = ucsdet_getName(match, &status);
|
46
|
+
if (U_FAILURE(status)) {
|
47
|
+
icu_rb_raise_icu_error(status);
|
48
|
+
}
|
49
|
+
status = U_ZERO_ERROR;
|
50
|
+
const char* language = ucsdet_getLanguage(match, &status);
|
51
|
+
if (U_FAILURE(status)) {
|
52
|
+
icu_rb_raise_icu_error(status);
|
53
|
+
}
|
54
|
+
return rb_struct_new(rb_cICU_CharsetDetector_Match,
|
55
|
+
rb_str_new_cstr(name),
|
56
|
+
INT2NUM(confidence),
|
57
|
+
rb_str_new_cstr(language));
|
58
|
+
}
|
59
|
+
|
60
|
+
/*
 * Allocator for ICU::CharsetDetector: wraps a freshly allocated
 * icu_detector_data in an object of class `self`. The fields are filled in
 * later by detector_initialize.
 */
VALUE detector_alloc(VALUE self)
{
    icu_detector_data* this;
    return TypedData_Make_Struct(self, icu_detector_data, &icu_detector_type, this);
}
|
65
|
+
|
66
|
+
/*
 * ICU::CharsetDetector#initialize. Opens an ICU charset detector and
 * allocates the empty placeholder string used to reset the detector's text.
 * Arguments are accepted but ignored.
 *
 * Raises through icu_rb_raise_icu_error() when ucsdet_open() fails.
 */
VALUE detector_initialize(int argc, VALUE* argv, VALUE self)
{
    GET_DETECTOR(this);
    this->rb_instance = self;

    /*
     * The struct is zero-filled on allocation, so both pointers are NULL on
     * first use. If initialize is invoked again on the same object, release
     * what the earlier call acquired instead of overwriting (and leaking) it.
     */
    if (this->service != NULL) {
        ucsdet_close(this->service);
        this->service = NULL;
    }
    if (this->dummy_str != NULL) {
        ruby_xfree(this->dummy_str);
        this->dummy_str = NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    this->service = ucsdet_open(&status);
    if (U_FAILURE(status)) {
        icu_rb_raise_icu_error(status);
    }
    this->dummy_str = ALLOC_N(char, 1);
    this->dummy_str[0] = '\0';

    return self;
}
|
82
|
+
|
83
|
+
/*
 * Points the detector at the zero-length placeholder string so that it no
 * longer refers to a caller-supplied buffer.
 * Raises through icu_rb_raise_icu_error() on failure.
 */
static inline void detector_reset_text(const icu_detector_data* this)
{
    UErrorCode status = U_ZERO_ERROR;
    ucsdet_setText(this->service, this->dummy_str, 0, &status);
    if (U_FAILURE(status)) {
        icu_rb_raise_icu_error(status);
    }
}
|
91
|
+
|
92
|
+
// Hands the bytes of rb_str to the detector as the text to analyse.
// rb_str must be a ruby String; its buffer is passed by pointer, not copied
// here.
// Raises through icu_rb_raise_icu_error() on failure.
static inline void detector_set_text(const icu_detector_data* this, VALUE rb_str)
{
    UErrorCode status = U_ZERO_ERROR;
    ucsdet_setText(this->service, RSTRING_PTR(rb_str), RSTRING_LENINT(rb_str), &status);
    if (U_FAILURE(status)) {
        icu_rb_raise_icu_error(status);
    }
}
|
101
|
+
|
102
|
+
//
// ICU::CharsetDetector#detect. Converts `str` with StringValue, runs the
// detector on it and returns the best match as a Match struct; the detector's
// text is reset before returning.
//
// Cases noted by the original author:
//   - no charset appears to match the data.
//   - no input text has been provided
// NOTE(review): ICU documents ucsdet_detect() as returning NULL when no
// charset matches; that value is passed unchecked to
// detector_populate_match_struct — confirm the resulting error is intended.
VALUE detector_detect(VALUE self, VALUE str)
{
    StringValue(str);
    GET_DETECTOR(this);

    detector_set_text(this, str);
    UErrorCode status = U_ZERO_ERROR;
    const UCharsetMatch* match = ucsdet_detect(this->service, &status);
    if (U_FAILURE(status)) {
        icu_rb_raise_icu_error(status);
    }

    VALUE rb_match = detector_populate_match_struct(match);
    detector_reset_text(this);
    return rb_match;
}
|
121
|
+
|
122
|
+
/*
 * ICU::CharsetDetector#detect_all. Converts `str` with StringValue, runs the
 * detector on it and returns an Array holding one Match struct per candidate
 * reported by ucsdet_detectAll(), in the order ICU returned them. The
 * detector's text is reset before returning.
 */
VALUE detector_detect_all(VALUE self, VALUE str)
{
    StringValue(str);
    GET_DETECTOR(this);

    detector_set_text(this, str);

    int32_t match_count = 0;
    UErrorCode status = U_ZERO_ERROR;
    const UCharsetMatch** found = ucsdet_detectAll(this->service, &match_count, &status);
    if (U_FAILURE(status)) {
        icu_rb_raise_icu_error(status);
    }

    /* start with room for a few entries; the array grows as needed */
    VALUE rb_matches = rb_ary_new2(3);
    int32_t idx = 0;
    while (idx < match_count) {
        rb_ary_push(rb_matches, detector_populate_match_struct(found[idx]));
        idx++;
    }

    detector_reset_text(this);
    return rb_matches;
}
|
143
|
+
|
144
|
+
/* Returns Qtrue when the detector's input filter is enabled, Qfalse otherwise. */
static inline VALUE detector_get_input_filter_internal(const icu_detector_data* this)
{
    return ucsdet_isInputFilterEnabled(this->service) != 0 ? Qtrue : Qfalse;
}
|
148
|
+
|
149
|
+
/* ICU::CharsetDetector#input_filter: true when input filtering is enabled. */
VALUE detector_get_input_filter(VALUE self)
{
    GET_DETECTOR(this);
    return detector_get_input_filter_internal(this);
}
|
154
|
+
|
155
|
+
/*
 * ICU::CharsetDetector#input_filter=. Enables input filtering when `flag` is
 * truthy by Ruby's rules (anything but nil and false) and disables it
 * otherwise, then returns the detector's resulting setting as true/false.
 *
 * RTEST is used instead of comparing against Qtrue so that truthy values
 * other than the literal `true` are not silently treated as "disable".
 */
VALUE detector_set_input_filter(VALUE self, VALUE flag)
{
    GET_DETECTOR(this);
    ucsdet_enableInputFilter(this->service, RTEST(flag) ? TRUE : FALSE);
    return detector_get_input_filter_internal(this);
}
|
161
|
+
|
162
|
+
/*
 * ICU::CharsetDetector#detectable_charsets. Obtains the enumeration of
 * charsets the detector can recognise and converts it with
 * icu_enum_to_rb_ary(), which also receives the ICU status for inspection.
 * NOTE(review): the literal 28 is presumably an initial capacity hint for the
 * resulting array — confirm against icu_enum_to_rb_ary.
 */
VALUE detector_detectable_charsets(VALUE self)
{
    GET_DETECTOR(this);
    UErrorCode status = U_ZERO_ERROR;
    UEnumeration* charsets = ucsdet_getAllDetectableCharsets(this->service, &status);
    return icu_enum_to_rb_ary(charsets, status, 28);
}
|
169
|
+
|
170
|
+
/*
 * Registers ICU::CharsetDetector (allocator plus instance methods) and the
 * nested ICU::CharsetDetector::Match struct with members name, confidence and
 * language. rb_mICU must already be defined when this runs.
 */
void init_icu_charset_detector(void)
{
    rb_cICU_CharsetDetector = rb_define_class_under(rb_mICU, "CharsetDetector", rb_cObject);
    rb_define_alloc_func(rb_cICU_CharsetDetector, detector_alloc);
    /* arity -1: initialize accepts any number of arguments */
    rb_define_method(rb_cICU_CharsetDetector, "initialize", detector_initialize, -1);
    rb_define_method(rb_cICU_CharsetDetector, "detect", detector_detect, 1);
    rb_define_method(rb_cICU_CharsetDetector, "detect_all", detector_detect_all, 1);
    rb_define_method(rb_cICU_CharsetDetector, "input_filter", detector_get_input_filter, 0);
    rb_define_method(rb_cICU_CharsetDetector, "input_filter=", detector_set_input_filter, 1);
    rb_define_method(rb_cICU_CharsetDetector, "detectable_charsets", detector_detectable_charsets, 0);

    // define a Match struct in Ruby
    rb_cICU_CharsetDetector_Match = rb_struct_define_under(rb_cICU_CharsetDetector,
                                                           "Match",
                                                           "name",
                                                           "confidence",
                                                           "language",
                                                           NULL);
}
|
189
|
+
|
190
|
+
#undef GET_DETECTOR
|
191
|
+
|
192
|
+
/* vim: set expandtab sws=4 sw=4: */
|