icu 0.9.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.travis.yml +11 -0
- data/Gemfile +3 -0
- data/LICENSE +20 -0
- data/README.md +69 -0
- data/Rakefile +38 -0
- data/benchmark/normalization.rb +106 -0
- data/benchmark/normalization_phrases.txt +1031 -0
- data/benchmark/normalization_result.txt +45 -0
- data/benchmark/normalization_wikip.txt +2838 -0
- data/ext/icu/extconf.rb +242 -0
- data/ext/icu/icu.c +18 -0
- data/ext/icu/icu.h +78 -0
- data/ext/icu/icu_charset_detector.c +192 -0
- data/ext/icu/icu_collator.c +138 -0
- data/ext/icu/icu_locale.c +852 -0
- data/ext/icu/icu_normalizer.c +122 -0
- data/ext/icu/icu_number_format.c +0 -0
- data/ext/icu/icu_spoof_checker.c +194 -0
- data/ext/icu/icu_transliterator.c +159 -0
- data/ext/icu/internal_encoding.c +38 -0
- data/ext/icu/internal_ustring.c +304 -0
- data/ext/icu/internal_utils.c +50 -0
- data/ext/icu/rb_errors.c +14 -0
- data/icu.gemspec +22 -0
- data/lib/icu.rb +6 -18
- data/lib/icu/charset_detector.rb +5 -0
- data/lib/icu/collator.rb +24 -0
- data/lib/icu/locale.rb +19 -0
- data/lib/icu/transliterator.rb +8 -0
- data/lib/icu/version.rb +3 -0
- data/spec/charset_detector_spec.rb +47 -0
- data/spec/collator_spec.rb +73 -0
- data/spec/locale_spec.rb +312 -0
- data/spec/normalizer_spec.rb +35 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/spoof_checker_spec.rb +56 -0
- data/spec/transliterator_spec.rb +41 -0
- metadata +132 -55
- data/COPYING +0 -674
- data/COPYING.LESSER +0 -165
- data/README +0 -81
- data/ext/extconf.rb +0 -31
- data/ext/icu.c +0 -128
- data/ext/icu.h +0 -34
- data/ext/icu_locale.c +0 -330
- data/ext/icu_locale_country.c +0 -99
- data/ext/icu_locale_language.c +0 -99
- data/ext/icu_numeric.c +0 -161
- data/ext/icu_time.c +0 -391
- data/test/test_locale.rb +0 -73
- data/test/test_numeric.rb +0 -78
- data/test/test_time.rb +0 -75
data/ext/icu/extconf.rb
ADDED
@@ -0,0 +1,242 @@
|
|
1
|
+
require 'mkmf'
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
|
5
|
+
ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..', '..'))
|
6
|
+
|
7
|
+
# Utility functions
|
8
|
+
|
9
|
+
# Looks up the `--use-system-libraries` configure argument via `arg_config`.
#
# The default handed to `arg_config` is true exactly when the
# ICU_USE_SYSTEM_LIBRARIES environment variable is set (to any value),
# and false otherwise.
def using_system_libraries?
  requested_through_env = ENV.key?('ICU_USE_SYSTEM_LIBRARIES')
  arg_config('--use-system-libraries', requested_through_env)
end
|
12
|
+
|
13
|
+
# Building with system ICU
|
14
|
+
|
15
|
+
if using_system_libraries?
|
16
|
+
message "Building ICU using system libraries.\n Not supported yet, PR welcome!"
exit 1

# NOTE: everything below in this branch is unreachable until the `exit 1`
# above is removed; it is kept as the starting point for system-ICU support.

# When no --with-icu-dir/--with-icu-include/--with-icu-lib was given, try to
# locate a Homebrew-installed icu4c and add it to the search paths.
unless dir_config('icu').any?
  # `base` is the Homebrew prefix, i.e. the directory that contains "Cellar".
  base = if !`which brew`.empty?
    `brew --prefix`.strip
  elsif File.exist?("/usr/local/Cellar/icu4c")
    '/usr/local'
  end

  # Pick the last installed icu4c version (lexicographic order of the
  # version directories).
  if base && (icu4c = Dir[File.join(base, 'Cellar/icu4c/*')].sort.last)
    $INCFLAGS << " -I#{icu4c}/include "
    $LDFLAGS << " -L#{icu4c}/lib "
  end
end

unless have_library('icui18n') && have_header('unicode/ucnv.h')
  STDERR.puts <<-EOS
************************************************************************
icu not found.
install by brew install icu4c or apt-get install libicu-dev
************************************************************************
  EOS

  exit(1)
end

abort 'libicuuc missing' unless have_library('icuuc')
abort 'libicudata missing' unless have_library('icudata')
|
45
|
+
else
|
46
|
+
message "Building ICU from source.\n"
|
47
|
+
|
48
|
+
# The gem version constraint in the Rakefile is not respected at install time.
|
49
|
+
# Keep this version in sync with the one in the Rakefile !
|
50
|
+
require 'rubygems'
|
51
|
+
gem 'mini_portile2', '~> 2.2.0'
|
52
|
+
require 'mini_portile2'
|
53
|
+
|
54
|
+
# Checkout the source code of ICU.
|
55
|
+
# http://site.icu-project.org/download/
|
56
|
+
# http://userguide.icu-project.org/howtouseicu
|
57
|
+
# Also check the readme of ICU release file.
|
58
|
+
# mini_portile recipe that configures and builds ICU4C through ICU's
# `runConfigureICU` script, installing it under ROOT/ports.
class ICURecipe < MiniPortile
  # Sets up target/host/patches, lets the caller customise the recipe through
  # the block, then derives the build environment.
  #
  # name     - recipe name; also selects ROOT/patches/<name>/*.patch
  # version  - version string passed on to MiniPortile
  # static_p - truthy to additionally build static libraries with the
  #            compiler flags needed to link them into the extension
  #
  # Yields the recipe itself so the caller can set `files` and add
  # `configure_options`. Options of the form `NAME=value` are removed from
  # `configure_options` and become environment variables for the configure run.
  def initialize(name, version, static_p)
    super(name, version)
    self.target = File.join(ROOT, "ports")
    # Prefer host_alias over host in order to use i586-mingw32msvc as
    # correct compiler prefix for cross build, but use host if not set.
    # `to_s` guards against a missing (nil) host_alias entry.
    host_alias = RbConfig::CONFIG["host_alias"].to_s
    self.host = host_alias.empty? ? RbConfig::CONFIG["host"] : host_alias
    self.patch_files = Dir[File.join(ROOT, "patches", name, "*.patch")].sort
    self.configure_options << "--libdir=#{File.join(self.path, "lib")}"

    yield self

    # Lazily seeded from the process environment: the first access of a key
    # copies ENV[key] (or '' when unset) so later edits never touch ENV itself.
    env = Hash.new do |hash, key|
      hash[key] = ENV[key].to_s.dup
    end

    self.configure_options.flatten!

    # Move VAR=value style options out of the option list into the environment.
    self.configure_options.delete_if do |option|
      case option
      when /\A(\w+)=(.*)\z/
        env[$1] = $2
        true
      else
        false
      end
    end

    if static_p
      self.configure_options += [
        "--enable-shared",
        "--enable-static",
        "--disable-renaming"
      ]
      env['CFLAGS'] = "-fPIC #{env['CFLAGS']}"
      env['CPPFLAGS'] = "-DU_CHARSET_IS_UTF8=1 -DU_USING_ICU_NAMESPACE=0 -DU_STATIC_IMPLEMENTATION #{env['CPPFLAGS']}"
      env['CXXFLAGS'] = "-fPIC -fno-exceptions #{env['CXXFLAGS']}"
      # Extend the existing LDFLAGS (previously CFLAGS was appended here by
      # mistake, which discarded any user-supplied LDFLAGS).
      env['LDFLAGS'] = "-fPIC -static-libstdc++ #{env['LDFLAGS']}"
    else
      self.configure_options += [
        "--enable-shared",
        "--disable-static",
      ]
    end

    # Universal Ruby builds: pass Ruby's ARCH_FLAG on unless the caller
    # already chose architectures.
    if RbConfig::CONFIG['target_cpu'] == 'universal'
      %w[CFLAGS LDFLAGS].each do |key|
        unless env[key].include?('-arch')
          env[key] += ' ' + RbConfig::CONFIG['ARCH_FLAG']
        end
      end
    end

    @env = env
  end

  # Prints a notice about the packaged ICU build (including applied patches
  # and how to request system libraries instead), then runs the regular
  # MiniPortile cook.
  def cook
    message <<-"EOS"
************************************************************************
IMPORTANT NOTICE:

Building ICU with a packaged version of #{name}-#{version}#{'.' if self.patch_files.empty?}
    EOS

    unless self.patch_files.empty?
      message "with the following patches applied:\n"

      self.patch_files.each do |patch|
        message "\t- %s\n" % File.basename(patch)
      end
    end

    message <<-"EOS"

    gem install icu -- --use-system-libraries

If you are using Bundler, tell it to use the option:

    bundle config build.icu --use-system-libraries
    bundle install
    EOS

    message <<-"EOS"
************************************************************************
    EOS
    super
  end

  # Runs ICU's `runConfigureICU` for the current platform with the
  # environment prepared in #initialize, then calls MiniPortile's configure.
  def configure
    # run as recommend, basically set up compiler and flags
    platform = case RUBY_PLATFORM
               when /mingw|mswin/ then 'MSYS/MSVC'
               when /darwin/ then 'MacOSX'
               else 'Linux'
               end
    execute('ICU Configure', [@env] + ['./runConfigureICU', platform] + computed_options)
    super
  end

  # Directory the build commands run in: the `source` directory inside the
  # first directory found under tmp_path.
  #
  # Raises RuntimeError when tmp_path contains no directory yet.
  def work_path
    extracted = Dir.glob("#{tmp_path}/*").find { |d| File.directory?(d) }
    raise "no extracted source directory found under #{tmp_path}" unless extracted

    File.join(extracted, 'source')
  end
end
|
165
|
+
|
166
|
+
message "Using mini_portile version #{MiniPortile::VERSION}\n"

# `or` binds looser than `=`: static_p receives the value of
# enable_config('static', true) (default: enabled), and the message is
# printed only when that value is falsy.
static_p = enable_config('static', true) or
  message "Static linking is disabled.\n"
# Recipes to cook and activate further down.
recipes = []
|
171
|
+
|
172
|
+
# Recipe for the packaged ICU4C 59.1 release. The tarball is pinned by its
# SHA-256 checksum; the gpg transcript below is kept as a record of the
# signature check performed on the tarball.
libicu_recipe = ICURecipe.new("libicu", "59.1", static_p) do |recipe|
  recipe.files = [{
    url: "https://downloads.sourceforge.net/project/icu/ICU4C/59.1/icu4c-59_1-src.tgz?r=&ts=1501595646",
    sha256: "7132fdaf9379429d004005217f10e00b7d2319d0fea22bdfddef8991c45b75fe"
    # gpg: Signature made Fri Apr 14 21:00:23 2017 CEST using RSA key ID 4FB419E3
    # gpg: requesting key 4FB419E3 from hkps server hkps.pool.sks-keyservers.net
    # gpg: key 4FB419E3: public key "Steven R. Loomis (filfla-signing) <srloomis@us.ibm.com>" imported
    # gpg: 3 marginal(s) needed, 1 complete(s) needed, PGP trust model
    # gpg: depth: 0 valid: 2 signed: 1 trust: 0-, 0q, 0n, 0m, 0f, 2u
    # gpg: depth: 1 valid: 1 signed: 0 trust: 1-, 0q, 0n, 0m, 0f, 0u
    # gpg: next trustdb check due at 2018-08-19
    # gpg: Total number processed: 1
    # gpg: imported: 1 (RSA: 1)
    # gpg: Good signature from "Steven R. Loomis (filfla-signing) <srloomis@us.ibm.com>" [unknown]
    # gpg: aka "Steven R. Loomis (filfla-signing) <srl295@gmail.com>" [unknown]
    # gpg: aka "Steven R. Loomis (filfla-signing) <srl@icu-project.org>" [unknown]
    # gpg: aka "[jpeg image of size 4680]" [unknown]
    # gpg: WARNING: This key is not certified with a trusted signature!
    # gpg: There is no indication that the signature belongs to the owner.
    # Primary key fingerprint: BA90 283A 60D6 7BA0 DD91 0A89 3932 080F 4FB4 19E3
  }]
end
recipes.push libicu_recipe
|
195
|
+
|
196
|
+
# Cook each recipe once, then activate it. A "<name>-<version>-<host>.installed"
# marker file in the recipe's target directory records a finished cook, so
# later runs skip straight to activation.
recipes.each do |port|
  installed_marker = "#{port.target}/#{port.name}-#{port.version}-#{port.host}.installed"

  if !File.exist?(installed_marker)
    port.cook
    # Written only after cook returned, so an aborted build is retried.
    FileUtils.touch(installed_marker)
  end

  port.activate
end
|
205
|
+
|
206
|
+
# Merge the flags reported by the freshly built ICU's `icu-config` script into
# the mkmf globals ($libs, $LIBPATH, $LDFLAGS, $INCFLAGS, $CPPFLAGS, $CFLAGS).
$libs = $libs.shellsplit.tap do |libs|
  # Preprocessor defines the extension must be compiled with.
  icu_defines = '-DU_DISABLE_RENAMING=1 -DU_CHARSET_IS_UTF8=1 -DU_USING_ICU_NAMESPACE=0 -DU_STATIC_IMPLEMENTATION'

  [libicu_recipe].each do |recipe|
    libname = recipe.name[/\Alib(.+)\z/, 1]
    # TODO: build with pkg-config
    # Should do like PKG_CONFIG_PATH=/root/icu4r/ports/x86_64-pc-linux-gnu/libicu/59.1/lib/pkgconfig/ pkg-config --static icu-uc
    config = File.join(recipe.path, "bin", "#{libname}-config")
    # call config scripts explicit with 'sh' for compat with Windows;
    # the path is shell-escaped so a build directory containing spaces works.
    config_cmd = "sh #{config.shellescape}"

    $CPPFLAGS = "#{icu_defines} #{$CPPFLAGS}"

    `#{config_cmd} --ldflags`.strip.shellsplit.each do |arg|
      case arg
      when /\A-L(.+)\z/
        # Prioritize ports' directories
        if $1.start_with?(ROOT + '/')
          $LIBPATH = [$1] | $LIBPATH
        else
          $LIBPATH = $LIBPATH | [$1]
        end
      when /\A-l./
        libs.unshift(arg)
      else
        $LDFLAGS << ' ' << arg.shellescape
      end
    end

    search_path_flags = `#{config_cmd} --cppflags-searchpath`.strip
    $INCFLAGS = "#{search_path_flags} #{$INCFLAGS}"

    # A separating space is required before the existing $CFLAGS; without it
    # the last flag from icu-config fuses with the first existing flag.
    icu_cflags = `#{config_cmd} --cflags`.strip
    $CFLAGS = "#{icu_defines} #{icu_cflags} #{$CFLAGS}"
  end
end.shelljoin
|
234
|
+
|
235
|
+
end
|
236
|
+
|
237
|
+
# Default optimisation flags for the extension; compiled as C99.
$CFLAGS << ' -O3 -funroll-loops -std=c99'
# With DEBUG set in the environment, debugging flags are appended after the
# defaults (so -O0 comes later on the command line than -O3).
$CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']

# Print the final flag sets for the build log.
puts $CFLAGS, $CPPFLAGS, $CXXFLAGS

create_makefile('icu/icu')
|
data/ext/icu/icu.c
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#include "icu.h"
|
2
|
+
|
3
|
+
VALUE rb_mICU;
|
4
|
+
|
5
|
+
/*
 * Extension entry point: defines the top-level ICU module and then runs the
 * init function of every component, in the order listed.
 */
void Init_icu(void)
{
    rb_mICU = rb_define_module("ICU");
    init_internal_encoding();
    init_rb_errors();
    init_icu_collator();
    init_icu_normalizer();
    init_icu_spoof_checker();
    init_icu_transliterator();
    init_icu_charset_detector();
    init_icu_locale();
}
|
17
|
+
|
18
|
+
/* vim: set expandtab sws=4 sw=4: */
|
data/ext/icu/icu.h
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
#ifndef RUBY_EXTENSION_ICU_H_
|
2
|
+
#define RUBY_EXTENSION_ICU_H_
|
3
|
+
|
4
|
+
/* System libraries */
|
5
|
+
#include <stdlib.h>
|
6
|
+
|
7
|
+
/* Ruby headers */
|
8
|
+
#define ONIG_ESCAPE_UCHAR_COLLISION 1 // ruby.h defines UChar macro
|
9
|
+
#include <ruby.h>
|
10
|
+
#include <ruby/encoding.h>
|
11
|
+
#ifdef UChar // fail-safe
|
12
|
+
#undef UChar
|
13
|
+
#endif
|
14
|
+
#include "unicode/ustring.h"
|
15
|
+
#include "unicode/uenum.h"
|
16
|
+
#include "unicode/parseerr.h"
|
17
|
+
|
18
|
+
/* Globals */
|
19
|
+
|
20
|
+
extern VALUE rb_mICU;
|
21
|
+
extern VALUE rb_eICU_Error;
|
22
|
+
extern VALUE rb_eICU_InvalidParameterError;
|
23
|
+
extern VALUE rb_cICU_UString;
|
24
|
+
extern VALUE rb_cICU_Collator;
|
25
|
+
extern VALUE rb_cICU_Normalizer;
|
26
|
+
extern VALUE rb_cICU_SpoofChecker;
|
27
|
+
extern VALUE rb_cICU_Transliterator;
|
28
|
+
extern VALUE rb_cICU_CharsetDetector;
|
29
|
+
extern VALUE rb_cICU_CharsetDetector_Match;
|
30
|
+
extern VALUE rb_cICU_Locale;
|
31
|
+
|
32
|
+
/* Prototypes */
|
33
|
+
void Init_icu _(( void ));
|
34
|
+
void init_internal_encoding _(( void ));
|
35
|
+
void init_rb_errors _(( void ));
|
36
|
+
void init_icu_collator _(( void ));
|
37
|
+
void init_icu_normalizer _(( void ));
|
38
|
+
void init_icu_spoof_checker _(( void ));
|
39
|
+
void init_icu_transliterator _(( void ));
|
40
|
+
void init_icu_charset_detector _(( void ));
|
41
|
+
void init_icu_locale _(( void ));
|
42
|
+
|
43
|
+
int icu_is_rb_enc_idx_as_utf_8 _(( int ));
|
44
|
+
int icu_is_rb_str_as_utf_8 _(( VALUE ));
|
45
|
+
const char* icu_rb_str_enc_name _(( int ));
|
46
|
+
VALUE rb_str_enc_to_ascii_as_utf8 _(( VALUE ));
|
47
|
+
int icu_rb_str_enc_idx _(( VALUE ));
|
48
|
+
VALUE icu_enum_to_rb_ary _(( UEnumeration*, UErrorCode, long ));
|
49
|
+
extern void icu_rb_raise_icu_error _(( UErrorCode ));
|
50
|
+
extern void icu_rb_raise_icu_parse_error _(( const UParseError* ));
|
51
|
+
extern void icu_rb_raise_icu_invalid_parameter _(( const char*, const char* ));
|
52
|
+
|
53
|
+
VALUE icu_ustring_init_with_capa_enc _(( int32_t, int ));
|
54
|
+
VALUE icu_ustring_from_rb_str _(( VALUE ));
|
55
|
+
VALUE icu_ustring_from_uchar_str _(( const UChar*, int32_t ));
|
56
|
+
void icu_ustring_clear_ptr _(( VALUE ));
|
57
|
+
void icu_ustring_resize _(( VALUE, int32_t ));
|
58
|
+
void icu_ustring_set_enc _(( VALUE, int ));
|
59
|
+
VALUE icu_ustring_to_rb_enc_str_with_len _(( VALUE, int32_t ));
|
60
|
+
VALUE icu_ustring_to_rb_enc_str _(( VALUE ));
|
61
|
+
UChar* icu_ustring_ptr _(( VALUE ));
|
62
|
+
int32_t icu_ustring_len _(( VALUE ));
|
63
|
+
int32_t icu_ustring_capa _(( VALUE ));
|
64
|
+
VALUE char_buffer_to_rb_str _(( const char* ));
|
65
|
+
char* char_buffer_new _(( int32_t ));
|
66
|
+
void char_buffer_resize _(( const char*, int32_t ));
|
67
|
+
void char_buffer_free _(( const char* ));
|
68
|
+
|
69
|
+
/* Constants */
|
70
|
+
#define RUBY_C_STRING_TERMINATOR_SIZE 1
|
71
|
+
|
72
|
+
/* Macros */
|
73
|
+
/*
 * Encoding index of Ruby's default internal encoding, or of the locale
 * encoding when no default internal encoding is set.
 *
 * The previous form joined the two indexes with C's logical `||`, which
 * evaluates to 0 or 1 instead of selecting one of the indexes.
 */
#define ICU_RUBY_ENCODING_INDEX \
    (rb_default_internal_encoding() != NULL \
        ? rb_enc_to_index(rb_default_internal_encoding()) \
        : rb_locale_encindex())

/*
 * Name of the encoding registered at index _idx, or "" when there is none.
 * Fully parenthesised so it can be used inside larger expressions.
 */
#define ICU_RB_STRING_ENC_NAME_IDX(_idx) \
    (rb_enc_from_index(_idx) != NULL ? (rb_enc_from_index(_idx))->name : "")
|
75
|
+
|
76
|
+
#endif // RUBY_EXTENSION_ICU_H_
|
77
|
+
|
78
|
+
/* vim: set expandtab sws=4 sw=4: */
|
@@ -0,0 +1,192 @@
|
|
1
|
+
#include "icu.h"
|
2
|
+
#include "unicode/ucsdet.h"
|
3
|
+
|
4
|
+
/*
 * Declares a local `icu_detector_data*` named `_data` and fills it from the
 * typed data wrapped by `self` (a variable named `self` must be in scope).
 * Expands to a declaration plus a statement, so it cannot be used as an
 * expression; the caller supplies the trailing semicolon.
 */
#define GET_DETECTOR(_data) icu_detector_data* _data; \
                            TypedData_Get_Struct(self, icu_detector_data, &icu_detector_type, _data)
|
6
|
+
|
7
|
+
VALUE rb_cICU_CharsetDetector;
|
8
|
+
VALUE rb_cICU_CharsetDetector_Match;
|
9
|
+
|
10
|
+
/* Native state wrapped by each ICU::CharsetDetector instance. */
typedef struct {
    VALUE rb_instance;          // the Ruby object wrapping this struct (set in initialize)
    UCharsetDetector* service;  // ICU detector handle; opened in initialize, closed in detector_free
    char* dummy_str; // used for reset
} icu_detector_data;
|
15
|
+
|
16
|
+
static void detector_free(void* _this)
|
17
|
+
{
|
18
|
+
icu_detector_data* this = _this;
|
19
|
+
if (this->dummy_str != NULL) {
|
20
|
+
ruby_xfree(this->dummy_str);
|
21
|
+
}
|
22
|
+
ucsdet_close(this->service);
|
23
|
+
}
|
24
|
+
|
25
|
+
static size_t detector_memsize(const void* _)
|
26
|
+
{
|
27
|
+
return sizeof(icu_detector_data);
|
28
|
+
}
|
29
|
+
|
30
|
+
/* Typed-data descriptor used to wrap icu_detector_data in a Ruby object. */
static const rb_data_type_t icu_detector_type = {
    "icu/charset_detector",                    // type name
    {NULL, detector_free, detector_memsize,},  // no mark function; free and size callbacks
    0, 0,
    RUBY_TYPED_FREE_IMMEDIATELY,
};
|
36
|
+
|
37
|
+
static VALUE detector_populate_match_struct(const UCharsetMatch* match)
|
38
|
+
{
|
39
|
+
UErrorCode status = U_ZERO_ERROR;
|
40
|
+
int32_t confidence = ucsdet_getConfidence(match, &status);
|
41
|
+
if (U_FAILURE(status)) {
|
42
|
+
icu_rb_raise_icu_error(status);
|
43
|
+
}
|
44
|
+
status = U_ZERO_ERROR;
|
45
|
+
const char* name = ucsdet_getName(match, &status);
|
46
|
+
if (U_FAILURE(status)) {
|
47
|
+
icu_rb_raise_icu_error(status);
|
48
|
+
}
|
49
|
+
status = U_ZERO_ERROR;
|
50
|
+
const char* language = ucsdet_getLanguage(match, &status);
|
51
|
+
if (U_FAILURE(status)) {
|
52
|
+
icu_rb_raise_icu_error(status);
|
53
|
+
}
|
54
|
+
return rb_struct_new(rb_cICU_CharsetDetector_Match,
|
55
|
+
rb_str_new_cstr(name),
|
56
|
+
INT2NUM(confidence),
|
57
|
+
rb_str_new_cstr(language));
|
58
|
+
}
|
59
|
+
|
60
|
+
/*
 * Allocation function for ICU::CharsetDetector: wraps a freshly made
 * icu_detector_data in an object of the class passed as `self`.
 */
VALUE detector_alloc(VALUE self)
{
    icu_detector_data* data;
    VALUE wrapped = TypedData_Make_Struct(self, icu_detector_data, &icu_detector_type, data);
    return wrapped;
}
|
65
|
+
|
66
|
+
/*
 * CharsetDetector#initialize: opens the ICU detector and allocates the
 * one-byte empty string used to reset the detector's text.
 *
 * Any arguments are accepted and ignored (argc/argv are not read).
 * Resources left over from an earlier initialize call on the same object are
 * released first, so calling initialize again does not leak them.
 * An ICU failure while opening the detector is reported through
 * icu_rb_raise_icu_error.
 */
VALUE detector_initialize(int argc, VALUE* argv, VALUE self)
{
    GET_DETECTOR(this);
    this->rb_instance = self;

    // Both fields are NULL on a freshly allocated object.
    if (this->service != NULL) {
        ucsdet_close(this->service);
        this->service = NULL;
    }
    if (this->dummy_str != NULL) {
        ruby_xfree(this->dummy_str);
        this->dummy_str = NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    this->service = ucsdet_open(&status);
    if (U_FAILURE(status)) {
        this->service = NULL;
        icu_rb_raise_icu_error(status);
    }
    this->dummy_str = ALLOC_N(char, 1);
    this->dummy_str[0] = '\0';

    return self;
}
|
82
|
+
|
83
|
+
static inline void detector_reset_text(const icu_detector_data* this)
|
84
|
+
{
|
85
|
+
UErrorCode status = U_ZERO_ERROR;
|
86
|
+
ucsdet_setText(this->service, this->dummy_str, 0, &status);
|
87
|
+
if (U_FAILURE(status)) {
|
88
|
+
icu_rb_raise_icu_error(status);
|
89
|
+
}
|
90
|
+
}
|
91
|
+
|
92
|
+
// rb_str must be a ruby String
|
93
|
+
/*
 * Hands the bytes of rb_str (pointer and length) to the detector as the text
 * to analyse. ICU failures are reported through icu_rb_raise_icu_error.
 */
static inline void detector_set_text(const icu_detector_data* this, VALUE rb_str)
{
    UErrorCode err = U_ZERO_ERROR;
    const char* bytes = RSTRING_PTR(rb_str);
    int32_t byte_len = RSTRING_LENINT(rb_str);

    ucsdet_setText(this->service, bytes, byte_len, &err);
    if (U_SUCCESS(err)) {
        return;
    }
    icu_rb_raise_icu_error(err);
}
|
101
|
+
|
102
|
+
//
|
103
|
+
// no charset appears to match the data.
|
104
|
+
// no input text has been provided
|
105
|
+
/*
 * CharsetDetector#detect(str): runs ICU charset detection on str and returns
 * the resulting match as a CharsetDetector::Match struct.
 *
 * str is coerced with StringValue. The detector's text is reset to the empty
 * dummy string before returning. ICU failures are reported through
 * icu_rb_raise_icu_error.
 */
VALUE detector_detect(VALUE self, VALUE str)
{
    StringValue(str);
    GET_DETECTOR(this);

    detector_set_text(this, str);

    UErrorCode err = U_ZERO_ERROR;
    const UCharsetMatch* best = ucsdet_detect(this->service, &err);
    if (U_FAILURE(err)) {
        icu_rb_raise_icu_error(err);
    }

    VALUE found = detector_populate_match_struct(best);
    detector_reset_text(this);

    return found;
}
|
121
|
+
|
122
|
+
/*
 * CharsetDetector#detect_all(str): runs ICU charset detection on str and
 * returns an Array with one CharsetDetector::Match struct per candidate, in
 * the order ICU returned them.
 *
 * str is coerced with StringValue. The detector's text is reset to the empty
 * dummy string before returning. ICU failures are reported through
 * icu_rb_raise_icu_error.
 */
VALUE detector_detect_all(VALUE self, VALUE str)
{
    StringValue(str);
    GET_DETECTOR(this);

    detector_set_text(this, str);

    UErrorCode err = U_ZERO_ERROR;
    int32_t found = 0;
    const UCharsetMatch** candidates = ucsdet_detectAll(this->service, &found, &err);
    if (U_FAILURE(err)) {
        icu_rb_raise_icu_error(err);
    }

    VALUE list = rb_ary_new2(3); // pre-allocate some slots
    int32_t idx = 0;
    while (idx < found) {
        rb_ary_push(list, detector_populate_match_struct(candidates[idx]));
        ++idx;
    }

    detector_reset_text(this);
    return list;
}
|
143
|
+
|
144
|
+
static inline VALUE detector_get_input_filter_internal(const icu_detector_data* this)
|
145
|
+
{
|
146
|
+
return ucsdet_isInputFilterEnabled(this->service) != 0 ? Qtrue : Qfalse;
|
147
|
+
}
|
148
|
+
|
149
|
+
/* CharsetDetector#input_filter: true/false for the detector's input-filter state. */
VALUE detector_get_input_filter(VALUE self)
{
    GET_DETECTOR(detector);

    VALUE enabled = detector_get_input_filter_internal(detector);
    return enabled;
}
|
154
|
+
|
155
|
+
/*
 * CharsetDetector#input_filter=(flag): enables the ICU input filter when flag
 * is truthy (anything but nil/false) and disables it otherwise.
 *
 * Returns the filter state ICU reports afterwards (true/false).
 *
 * RTEST follows Ruby truthiness, so values such as 1 or "yes" enable the
 * filter. The UBool is built from 1/0 directly instead of the TRUE/FALSE
 * macros, which newer ICU headers no longer define by default.
 */
VALUE detector_set_input_filter(VALUE self, VALUE flag)
{
    GET_DETECTOR(this);
    ucsdet_enableInputFilter(this->service, (UBool)(RTEST(flag) ? 1 : 0));
    return detector_get_input_filter_internal(this);
}
|
161
|
+
|
162
|
+
/*
 * CharsetDetector#detectable_charsets: asks ICU for the enumeration of all
 * detectable charsets and converts it with icu_enum_to_rb_ary.
 *
 * The ICU status is not checked here; it is passed on to icu_enum_to_rb_ary
 * together with the enumeration and the constant 28.
 */
VALUE detector_detectable_charsets(VALUE self)
{
    GET_DETECTOR(detector);

    UErrorCode err = U_ZERO_ERROR;
    UEnumeration* names = ucsdet_getAllDetectableCharsets(detector->service, &err);

    return icu_enum_to_rb_ary(names, err, 28);
}
|
169
|
+
|
170
|
+
void init_icu_charset_detector(void)
|
171
|
+
{
|
172
|
+
rb_cICU_CharsetDetector = rb_define_class_under(rb_mICU, "CharsetDetector", rb_cObject);
|
173
|
+
rb_define_alloc_func(rb_cICU_CharsetDetector, detector_alloc);
|
174
|
+
rb_define_method(rb_cICU_CharsetDetector, "initialize", detector_initialize, -1);
|
175
|
+
rb_define_method(rb_cICU_CharsetDetector, "detect", detector_detect, 1);
|
176
|
+
rb_define_method(rb_cICU_CharsetDetector, "detect_all", detector_detect_all, 1);
|
177
|
+
rb_define_method(rb_cICU_CharsetDetector, "input_filter", detector_get_input_filter, 0);
|
178
|
+
rb_define_method(rb_cICU_CharsetDetector, "input_filter=", detector_set_input_filter, 1);
|
179
|
+
rb_define_method(rb_cICU_CharsetDetector, "detectable_charsets", detector_detectable_charsets, 0);
|
180
|
+
|
181
|
+
// define a Match struct in Ruby
|
182
|
+
rb_cICU_CharsetDetector_Match = rb_struct_define_under(rb_cICU_CharsetDetector,
|
183
|
+
"Match",
|
184
|
+
"name",
|
185
|
+
"confidence",
|
186
|
+
"language",
|
187
|
+
NULL);
|
188
|
+
}
|
189
|
+
|
190
|
+
#undef GET_DETECTOR
|
191
|
+
|
192
|
+
/* vim: set expandtab sws=4 sw=4: */
|