uchardet 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +3 -0
- data/Manifest.txt +18 -0
- data/README.rdoc +50 -0
- data/Rakefile +23 -0
- data/bin/uchardet +8 -0
- data/ext/uchardet/extconf.rb +11 -0
- data/ext/uchardet/uchardet.c +340 -0
- data/lib/uchardet.rb +37 -0
- data/lib/uchardet/cli.rb +80 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/tasks/extconf.rake +13 -0
- data/tasks/extconf/uchardet.rake +43 -0
- data/test/test_helper.rb +3 -0
- data/test/test_uchardet.rb +22 -0
- data/test/test_uchardet_cli.rb +14 -0
- data/test/test_uchardet_extn.rb +101 -0
- metadata +89 -0
data/History.txt
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
History.txt
|
2
|
+
Manifest.txt
|
3
|
+
README.rdoc
|
4
|
+
Rakefile
|
5
|
+
bin/uchardet
|
6
|
+
ext/uchardet/extconf.rb
|
7
|
+
ext/uchardet/uchardet.c
|
8
|
+
lib/uchardet.rb
|
9
|
+
lib/uchardet/cli.rb
|
10
|
+
script/console
|
11
|
+
script/destroy
|
12
|
+
script/generate
|
13
|
+
tasks/extconf.rake
|
14
|
+
tasks/extconf/uchardet.rake
|
15
|
+
test/test_helper.rb
|
16
|
+
test/test_uchardet.rb
|
17
|
+
test/test_uchardet_cli.rb
|
18
|
+
test/test_uchardet_extn.rb
|
data/README.rdoc
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
= uchardet
|
2
|
+
|
3
|
+
* http://github.com/invisiblellama/uchardet
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
Fast character set encoding detection using the International Components for Unicode (ICU) C++ library.
|
8
|
+
|
9
|
+
== SYNOPSIS:
|
10
|
+
|
11
|
+
require 'open-uri'
|
12
|
+
require 'uchardet'
|
13
|
+
|
14
|
+
encoding = ICU::UCharsetDetector.detect open('http://google.jp').read
|
15
|
+
encoding # => {:language=>"ja", :encoding=>"Shift_JIS", :confidence=>100}
|
16
|
+
|
17
|
+
From command line:
|
18
|
+
|
19
|
+
$ uchardet
|
20
|
+
|
21
|
+
Usage: uchardet [options] file
|
22
|
+
-l, --list Display list of detectable character sets.
|
23
|
+
-s, --strip Strip HTML or XML markup before detection.
|
24
|
+
-e, --encoding Hint the charset detector about possible encoding.
|
25
|
+
-a, --all Show all matching encodings.
|
26
|
+
-h, --help Show this help message.
|
27
|
+
|
28
|
+
$ uchardet `which uchardet`
|
29
|
+
|
30
|
+
ISO-8859-1 (confidence 60%)
|
31
|
+
|
32
|
+
== REQUIREMENTS:
|
33
|
+
|
34
|
+
ICU[http://site.icu-project.org/] (International Components for Unicode):
|
35
|
+
|
36
|
+
on Mac OS X:
|
37
|
+
|
38
|
+
sudo port install icu
|
39
|
+
|
40
|
+
on Debian/Ubuntu:
|
41
|
+
|
42
|
+
sudo apt-get install libicu-dev
|
43
|
+
|
44
|
+
== INSTALL:
|
45
|
+
|
46
|
+
sudo gem install uchardet
|
47
|
+
|
48
|
+
== LICENSE:
|
49
|
+
|
50
|
+
Copyright (c) 2009 Dmitri Goutnik, released under the MIT license.
|
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Rake entry point for the uchardet gem: sets up Hoe (with the newgem
# plugin) and loads every task file found under tasks/.
require 'rubygems'
gem 'hoe', '>= 2.1.0'
require 'hoe'
require 'fileutils'
require './lib/uchardet'

Hoe.plugin :newgem

# Generate all the Rake tasks
# Run 'rake -T' to see list of generated tasks (from gem root directory)
$hoe = Hoe.spec('uchardet') do
  developer('Dmitri Goutnik', 'dg@syrec.org')
  self.readme_file      = 'README.rdoc'
  self.extra_rdoc_files = ['README.rdoc']
  self.rubyforge_name   = name
end

require 'newgem/tasks'
Dir.glob('tasks/**/*.rake').each do |rake_file|
  load rake_file
end

# TODO - want other tests/tasks run by default? Add them to the list
# remove_task :default
# task :default => [:spec, :features]
|
data/bin/uchardet
ADDED
data/ext/uchardet/extconf.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# Generates the Makefile for the uchardet extension.
#
# Linker settings are taken from `icu-config` when that tool is on the PATH.
# When it is not (recent ICU packages no longer ship it), mkmf's pkg_config
# helper is tried for the "icu-i18n" package before giving up.
require 'mkmf'

icu_config = `which icu-config`.strip

if !icu_config.empty?
  $LIBS << ' ' + `#{icu_config} --ldflags-system --ldflags-libsonly`.strip
  $LDFLAGS << ' ' + `#{icu_config} --ldflags-searchpath`.strip
elsif !pkg_config('icu-i18n')
  # Neither discovery mechanism found ICU, so the extension cannot be linked.
  abort "ICU seems to be missing. Try 'port install icu' or 'apt-get install libicu-dev'"
end

create_makefile("uchardet")
|
data/ext/uchardet/uchardet.c
ADDED
@@ -0,0 +1,340 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <unicode/ucsdet.h>
|
3
|
+
|
4
|
+
#ifndef RSTRING_PTR
|
5
|
+
# define RSTRING_PTR(str) RSTRING(str)->ptr
|
6
|
+
# define RSTRING_LEN(str) RSTRING(str)->len
|
7
|
+
#endif
|
8
|
+
|
9
|
+
static VALUE cUChardetError;
|
10
|
+
static VALUE cUCharsetDetector;
|
11
|
+
|
12
|
+
static void
|
13
|
+
assure(UErrorCode status)
|
14
|
+
{
|
15
|
+
if (U_FAILURE(status)) {
|
16
|
+
VALUE ex = rb_exc_new2(cUChardetError, u_errorName(status));
|
17
|
+
rb_iv_set(ex, "@errno", INT2FIX(status));
|
18
|
+
rb_exc_raise(ex);
|
19
|
+
}
|
20
|
+
}
|
21
|
+
|
22
|
+
static void
|
23
|
+
UCharsetDetector_free(void *detector)
|
24
|
+
{
|
25
|
+
ucsdet_close(detector);
|
26
|
+
}
|
27
|
+
|
28
|
+
static VALUE
|
29
|
+
UCharsetDetector_alloc(VALUE klass)
|
30
|
+
{
|
31
|
+
UErrorCode status = U_ZERO_ERROR;
|
32
|
+
UCharsetDetector* detector = ucsdet_open(&status);
|
33
|
+
assure(status);
|
34
|
+
|
35
|
+
return Data_Wrap_Struct(klass, NULL, UCharsetDetector_free, detector);
|
36
|
+
}
|
37
|
+
|
38
|
+
/*
|
39
|
+
* call-seq:
|
40
|
+
* input_filtered
|
41
|
+
*
|
42
|
+
* Return filtering flag value this charset detector.
|
43
|
+
*/
|
44
|
+
static VALUE
|
45
|
+
UCharsetDetector_get_input_filtered(VALUE self)
|
46
|
+
{
|
47
|
+
UCharsetDetector *detector;
|
48
|
+
Data_Get_Struct(self, UCharsetDetector, detector);
|
49
|
+
|
50
|
+
return ucsdet_isInputFilterEnabled(detector) ? Qtrue : Qfalse;
|
51
|
+
}
|
52
|
+
|
53
|
+
/*
|
54
|
+
* call-seq:
|
55
|
+
* input_filtered=
|
56
|
+
*
|
57
|
+
* Enable filtering of input text. If filtering is enabled,
|
58
|
+
* text within angle brackets ("<" and ">") will be removed
|
59
|
+
* before detection, which will remove most HTML or xml markup.
|
60
|
+
*/
|
61
|
+
static VALUE
|
62
|
+
UCharsetDetector_set_input_filtered(VALUE self, VALUE flag)
|
63
|
+
{
|
64
|
+
UCharsetDetector *detector;
|
65
|
+
Data_Get_Struct(self, UCharsetDetector, detector);
|
66
|
+
|
67
|
+
ucsdet_enableInputFilter(detector, RTEST(flag) ? TRUE : FALSE);
|
68
|
+
return self;
|
69
|
+
}
|
70
|
+
|
71
|
+
/*
|
72
|
+
* call-seq:
|
73
|
+
* text
|
74
|
+
*
|
75
|
+
* Get input text for this detector.
|
76
|
+
*/
|
77
|
+
static VALUE
|
78
|
+
UCharsetDetector_get_text(VALUE self)
|
79
|
+
{
|
80
|
+
return rb_iv_get(self, "@text");
|
81
|
+
}
|
82
|
+
|
83
|
+
/*
|
84
|
+
* call-seq:
|
85
|
+
* text=
|
86
|
+
*
|
87
|
+
* Set input text for this detector.
|
88
|
+
*/
|
89
|
+
static VALUE
|
90
|
+
UCharsetDetector_set_text(VALUE self, VALUE text)
|
91
|
+
{
|
92
|
+
return rb_iv_set(self, "@text", text);
|
93
|
+
return text;
|
94
|
+
}
|
95
|
+
|
96
|
+
/*
|
97
|
+
* call-seq:
|
98
|
+
* declared_encoding
|
99
|
+
*
|
100
|
+
* Get the declared encoding for charset detection.
|
101
|
+
*/
|
102
|
+
static VALUE
|
103
|
+
UCharsetDetector_get_declared_encoding(VALUE self)
|
104
|
+
{
|
105
|
+
return rb_iv_get(self, "@declared_encoding");
|
106
|
+
}
|
107
|
+
|
108
|
+
/*
|
109
|
+
* call-seq:
|
110
|
+
* declared_encoding=
|
111
|
+
*
|
112
|
+
* Set the declared encoding for charset detection.
|
113
|
+
* The declared encoding of an input text is an encoding obtained
|
114
|
+
* by the user from an http header or xml declaration or similar source that
|
115
|
+
* can be provided as an additional hint to the charset detector.
|
116
|
+
*/
|
117
|
+
static VALUE
|
118
|
+
UCharsetDetector_set_declared_encoding(VALUE self, VALUE declared_encoding)
|
119
|
+
{
|
120
|
+
return rb_iv_set(self, "@declared_encoding", declared_encoding);
|
121
|
+
return declared_encoding;
|
122
|
+
}
|
123
|
+
|
124
|
+
static void
|
125
|
+
set_text(VALUE self, VALUE text)
|
126
|
+
{
|
127
|
+
if (!NIL_P(text)) {
|
128
|
+
text = StringValue(text);
|
129
|
+
|
130
|
+
UErrorCode status = U_ZERO_ERROR;
|
131
|
+
UCharsetDetector *detector;
|
132
|
+
Data_Get_Struct(self, UCharsetDetector, detector);
|
133
|
+
|
134
|
+
ucsdet_setText(detector, StringValuePtr(text), RSTRING_LEN(text), &status);
|
135
|
+
assure(status);
|
136
|
+
|
137
|
+
UCharsetDetector_set_text(self, text);
|
138
|
+
}
|
139
|
+
}
|
140
|
+
|
141
|
+
static void
|
142
|
+
set_declared_encoding(VALUE self, VALUE declared_encoding)
|
143
|
+
{
|
144
|
+
if (!NIL_P(declared_encoding)){
|
145
|
+
declared_encoding = StringValue(declared_encoding);
|
146
|
+
|
147
|
+
UErrorCode status = U_ZERO_ERROR;
|
148
|
+
UCharsetDetector *detector;
|
149
|
+
Data_Get_Struct(self, UCharsetDetector, detector);
|
150
|
+
|
151
|
+
ucsdet_setDeclaredEncoding(detector, StringValuePtr(declared_encoding), RSTRING_LEN(declared_encoding), &status);
|
152
|
+
assure(status);
|
153
|
+
|
154
|
+
UCharsetDetector_set_declared_encoding(self, declared_encoding);
|
155
|
+
}
|
156
|
+
}
|
157
|
+
|
158
|
+
/*
|
159
|
+
* call-seq:
|
160
|
+
* new(text=nil, declared_encoding=nil)
|
161
|
+
*
|
162
|
+
* Create a new charset detector. Optionally set input text and declared encoding.
|
163
|
+
*/
|
164
|
+
static VALUE
|
165
|
+
UCharsetDetector_initialize(int argc, VALUE *argv, VALUE self)
|
166
|
+
{
|
167
|
+
VALUE text;
|
168
|
+
VALUE declared_encoding;
|
169
|
+
|
170
|
+
rb_scan_args(argc, argv, "02", &text, &declared_encoding);
|
171
|
+
if (NIL_P(text))
|
172
|
+
UCharsetDetector_set_text(self, Qnil);
|
173
|
+
else
|
174
|
+
set_text(self, text);
|
175
|
+
|
176
|
+
if (NIL_P(declared_encoding))
|
177
|
+
UCharsetDetector_set_declared_encoding(self, Qnil);
|
178
|
+
else
|
179
|
+
set_declared_encoding(self, declared_encoding);
|
180
|
+
|
181
|
+
return self;
|
182
|
+
}
|
183
|
+
|
184
|
+
/*
|
185
|
+
* call-seq:
|
186
|
+
* detect(text=nil, declared_encoding=nil)
|
187
|
+
*
|
188
|
+
* Return the charset that best matches the supplied input data.
|
189
|
+
*
|
190
|
+
* Note though, that because the detection
|
191
|
+
* only looks at the start of the input data,
|
192
|
+
* there is a possibility that the returned charset will fail to handle
|
193
|
+
* the full set of input data.
|
194
|
+
*
|
195
|
+
* The function will fail if
|
196
|
+
* * no charset appears to match the data
|
197
|
+
* * no input text has been provided (with +text+ or set with #text= )
|
198
|
+
*/
|
199
|
+
static VALUE
|
200
|
+
UCharsetDetector_detect(int argc, VALUE *argv, VALUE self)
|
201
|
+
{
|
202
|
+
VALUE text;
|
203
|
+
VALUE declared_encoding;
|
204
|
+
|
205
|
+
rb_scan_args(argc, argv, "02", &text, &declared_encoding);
|
206
|
+
set_text(self, text);
|
207
|
+
set_declared_encoding(self, declared_encoding);
|
208
|
+
|
209
|
+
UErrorCode status = U_ZERO_ERROR;
|
210
|
+
UCharsetDetector *detector;
|
211
|
+
Data_Get_Struct(self, UCharsetDetector, detector);
|
212
|
+
|
213
|
+
const UCharsetMatch *match = ucsdet_detect(detector, &status);
|
214
|
+
assure(status);
|
215
|
+
|
216
|
+
const char *encoding_name = ucsdet_getName(match, &status);
|
217
|
+
assure(status);
|
218
|
+
|
219
|
+
int32_t encoding_confidence = ucsdet_getConfidence(match, &status);
|
220
|
+
assure(status);
|
221
|
+
|
222
|
+
const char *encoding_language = ucsdet_getLanguage(match, &status);
|
223
|
+
assure(status);
|
224
|
+
|
225
|
+
VALUE hash = rb_hash_new();
|
226
|
+
rb_hash_aset(hash, ID2SYM(rb_intern("encoding")), rb_str_new2(encoding_name));
|
227
|
+
rb_hash_aset(hash, ID2SYM(rb_intern("confidence")), INT2NUM(encoding_confidence));
|
228
|
+
rb_hash_aset(hash, ID2SYM(rb_intern("language")), rb_str_new2(encoding_language));
|
229
|
+
|
230
|
+
return hash;
|
231
|
+
}
|
232
|
+
|
233
|
+
/*
|
234
|
+
* call-seq:
|
235
|
+
* detect_all(text=nil, declared_encoding=nil)
|
236
|
+
*
|
237
|
+
* Find all charset matches that appear to be consistent with the input,
|
238
|
+
* returning an array of results. The results are ordered with the
|
239
|
+
* best quality match first.
|
240
|
+
*
|
241
|
+
* Because the detection only looks at a limited amount of the
|
242
|
+
* input byte data, some of the returned charsets may fail to handle
|
243
|
+
* the all of input data.
|
244
|
+
*
|
245
|
+
* Return an error if
|
246
|
+
* * no charset appears to match the data
|
247
|
+
* * no input text has been provided (with +text+ or set with #text= )
|
248
|
+
*/
|
249
|
+
static VALUE
|
250
|
+
UCharsetDetector_detect_all(int argc, VALUE *argv, VALUE self)
|
251
|
+
{
|
252
|
+
VALUE text;
|
253
|
+
VALUE declared_encoding;
|
254
|
+
|
255
|
+
rb_scan_args(argc, argv, "02", &text, &declared_encoding);
|
256
|
+
set_text(self, text);
|
257
|
+
set_declared_encoding(self, declared_encoding);
|
258
|
+
|
259
|
+
UCharsetDetector *detector;
|
260
|
+
Data_Get_Struct(self, UCharsetDetector, detector);
|
261
|
+
UErrorCode status = U_ZERO_ERROR;
|
262
|
+
int32_t matches_found;
|
263
|
+
|
264
|
+
const UCharsetMatch **matches = ucsdet_detectAll(detector, &matches_found, &status);
|
265
|
+
assure(status);
|
266
|
+
|
267
|
+
VALUE ary = rb_ary_new();
|
268
|
+
int i = 0;
|
269
|
+
|
270
|
+
for (i = 0; i < matches_found; i++) {
|
271
|
+
const char *encoding_name = ucsdet_getName(matches[i], &status);
|
272
|
+
assure(status);
|
273
|
+
|
274
|
+
int32_t encoding_confidence = ucsdet_getConfidence(matches[i], &status);
|
275
|
+
assure(status);
|
276
|
+
|
277
|
+
const char *encoding_language = ucsdet_getLanguage(matches[i], &status);
|
278
|
+
assure(status);
|
279
|
+
|
280
|
+
VALUE hash = rb_hash_new();
|
281
|
+
rb_hash_aset(hash, ID2SYM(rb_intern("encoding")), rb_str_new2(encoding_name));
|
282
|
+
rb_hash_aset(hash, ID2SYM(rb_intern("confidence")), INT2NUM(encoding_confidence));
|
283
|
+
rb_hash_aset(hash, ID2SYM(rb_intern("language")), rb_str_new2(encoding_language));
|
284
|
+
|
285
|
+
rb_ary_push(ary, hash);
|
286
|
+
}
|
287
|
+
|
288
|
+
return ary;
|
289
|
+
}
|
290
|
+
|
291
|
+
/*
|
292
|
+
* call-seq:
|
293
|
+
* detectable_charsets
|
294
|
+
*
|
295
|
+
* Get array of names of all detectable charsets that are known to the
|
296
|
+
* charset detection service.
|
297
|
+
*/
|
298
|
+
static VALUE
|
299
|
+
UCharsetDetector_get_detectable_charsets(VALUE self)
|
300
|
+
{
|
301
|
+
UCharsetDetector *detector;
|
302
|
+
Data_Get_Struct(self, UCharsetDetector, detector);
|
303
|
+
UErrorCode status = U_ZERO_ERROR;
|
304
|
+
|
305
|
+
UEnumeration *charsets = ucsdet_getAllDetectableCharsets(detector, &status);
|
306
|
+
assure(status);
|
307
|
+
|
308
|
+
VALUE ary = rb_ary_new();
|
309
|
+
int32_t result_length;
|
310
|
+
const char *charset_name;
|
311
|
+
|
312
|
+
while (charset_name = uenum_next(charsets, &result_length, &status)) {
|
313
|
+
assure(status);
|
314
|
+
rb_ary_push(ary, rb_str_new2(charset_name));
|
315
|
+
}
|
316
|
+
uenum_close(charsets);
|
317
|
+
|
318
|
+
return ary;
|
319
|
+
}
|
320
|
+
|
321
|
+
void
|
322
|
+
Init_uchardet()
|
323
|
+
{
|
324
|
+
VALUE mICU = rb_define_module("ICU");
|
325
|
+
|
326
|
+
cUChardetError = rb_define_class_under(mICU, "Error", rb_eStandardError);
|
327
|
+
|
328
|
+
cUCharsetDetector = rb_define_class_under(mICU, "UCharsetDetector", rb_cObject);
|
329
|
+
rb_define_alloc_func(cUCharsetDetector, UCharsetDetector_alloc);
|
330
|
+
rb_define_method(cUCharsetDetector, "initialize", UCharsetDetector_initialize, -1);
|
331
|
+
rb_define_method(cUCharsetDetector, "input_filtered?", UCharsetDetector_get_input_filtered, 0);
|
332
|
+
rb_define_method(cUCharsetDetector, "input_filtered=", UCharsetDetector_set_input_filtered, 1);
|
333
|
+
rb_define_method(cUCharsetDetector, "text", UCharsetDetector_get_text, 0);
|
334
|
+
rb_define_method(cUCharsetDetector, "text=", UCharsetDetector_set_text, 1);
|
335
|
+
rb_define_method(cUCharsetDetector, "declared_encoding", UCharsetDetector_get_declared_encoding, 0);
|
336
|
+
rb_define_method(cUCharsetDetector, "declared_encoding=", UCharsetDetector_set_declared_encoding, 1);
|
337
|
+
rb_define_method(cUCharsetDetector, "detect", UCharsetDetector_detect, -1);
|
338
|
+
rb_define_method(cUCharsetDetector, "detect_all", UCharsetDetector_detect_all, -1);
|
339
|
+
rb_define_method(cUCharsetDetector, "detectable_charsets", UCharsetDetector_get_detectable_charsets, 0);
|
340
|
+
}
|
data/lib/uchardet.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# Put this directory on the load path unless it is already there, either as
# given or in expanded form.
lib_dir = File.dirname(__FILE__)
unless $:.include?(lib_dir) || $:.include?(File.expand_path(lib_dir))
  $:.unshift(lib_dir)
end

# Namespace holding the gem version.
module Uchardet
  VERSION = '0.1.1'
end

begin
  require 'uchardet.so'
rescue LoadError
  # Deliberately ignored - presumably so this file can still be loaded
  # (e.g. for VERSION) before the extension has been compiled.
end
|
13
|
+
|
14
|
+
module ICU # :main: README
  class UCharsetDetector # :main: README
    class << self
      ##
      # Shortcut for ICU::UCharsetDetector#detect: runs the detection on a
      # fresh detector instance, forwarding all arguments.
      #
      def detect(*args)
        new.detect(*args)
      end

      ##
      # Shortcut for ICU::UCharsetDetector#detect_all: runs the detection on
      # a fresh detector instance, forwarding all arguments.
      #
      def detect_all(*args)
        new.detect_all(*args)
      end

      ##
      # Shortcut for ICU::UCharsetDetector#detectable_charsets, called on a
      # fresh detector instance.
      #
      def detectable_charsets
        new.detectable_charsets
      end
    end
  end
end
|
data/lib/uchardet/cli.rb
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'optparse'

module Uchardet
  # Command-line front end for ICU::UCharsetDetector.
  class CLI
    # Parse +args+ and run the requested action, writing regular output to
    # +stdout+ and error messages to STDERR.
    #
    # With no arguments the usage text is printed. "--list" and "--help"
    # print and exit; otherwise the last remaining argument is taken as the
    # path of the file to analyse. Invalid options or a missing path
    # terminate the process with exit status 1.
    #
    # Returns the OptionParser instance.
    def self.execute(stdout, args=[])
      @stdout = stdout
      @options = {
        :input_filtered => false,
        :declared_encoding => nil,
        :detect_all => false,
        :path => nil
      }

      parser = OptionParser.new do |opts|
        opts.banner = <<-BANNER.gsub(/^\s*/,'')
          Usage: #{File.basename($0)} [options] file
        BANNER

        opts.on("-l", "--list",
          "Display list of detectable character sets."
        ) { self.list; exit }
        opts.on("-s", "--strip",
          "Strip HTML or XML markup before detection."
        ) { @options[:input_filtered] = true }
        # The ENCODING placeholder makes the switch take a mandatory
        # argument; without it OptionParser yields +true+ to the block.
        opts.on("-e", "--encoding ENCODING",
          "Hint the charset detector about possible encoding."
        ) { |arg| @options[:declared_encoding] = arg }
        opts.on("-a", "--all",
          "Show all matching encodings."
        ) { @options[:detect_all] = true }
        opts.on("-h", "--help",
          "Show this help message."
        ) { @stdout.puts opts; exit }
      end

      if args.empty?
        @stdout.puts parser
      else
        begin
          parser.parse!(args)
        rescue OptionParser::ParseError => ex
          STDERR.puts "ERROR: #{ex.to_s}. See #{File.basename($0)} --help"
          exit 1
        end

        @options[:path] = args.last
        if @options[:path].nil? || @options[:path].empty?
          @stdout.puts parser
          STDERR.puts "ERROR: please specify a file path."
          exit 1
        end

        self.detect
      end

      parser
    end

    # Print the sorted, de-duplicated names of all detectable charsets.
    def self.list
      ICU::UCharsetDetector.detectable_charsets.uniq.sort.each { |name| @stdout.puts name }
    end

    # Read the file named by the :path option and print one
    # "<encoding> (confidence <n>%)" line per match. Errors are reported on
    # STDERR instead of being raised.
    def self.detect
      detector = ICU::UCharsetDetector.new
      detector.input_filtered = @options[:input_filtered]

      # Binary mode: the detector must see the bytes exactly as stored.
      source = File.open(@options[:path], 'rb') { |file| file.read }
      # The hint is passed with the detection call itself so that it is
      # applied together with the text.
      hint = @options[:declared_encoding]
      matches = if @options[:detect_all]
        detector.detect_all(source, hint)
      else
        [detector.detect(source, hint)]
      end

      matches.each do |match|
        @stdout.puts "#{match[:encoding]} (confidence #{match[:confidence]}%)"
      end
    rescue StandardError => ex
      STDERR.puts "ERROR: #{ex.to_s}"
    end
  end
end
|
data/script/console
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
#!/usr/bin/env ruby
# File: script/console
#
# Starts irb with completion enabled and this gem's main library file
# preloaded.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

libs =  " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
libs << " -r #{File.dirname(__FILE__) + '/../lib/uchardet.rb'}"
puts "Loading uchardet gem"
exec "#{irb} #{libs} --simple-prompt"
|
data/script/destroy
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs the RubiGen "destroy" script for this project.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try a plain require first; fall back to loading RubyGems beforehand.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV.first)

RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Scripts::Destroy.new.run(ARGV)
|
data/script/generate
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs the RubiGen "generate" script for this project.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try a plain require first; fall back to loading RubyGems beforehand.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV.first)

RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Scripts::Generate.new.run(ARGV)
|
data/tasks/extconf.rake
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Umbrella tasks for native extensions: "compile" delegates to
# "extconf:compile", and "test" depends on "compile".
namespace :extconf do
  desc "Compiles the Ruby extension"
  task :compile
end

task :compile => ["extconf:compile"]

task :test => [:compile]

# Glob matching build products that "clean" should remove.
BIN = "*.{o,bundle,jar,so,obj,pdb,lib,def,exp}"
build_products = ["ext/**/#{BIN}", "lib/**/#{BIN}", 'ext/**/Makefile']

$hoe.clean_globs |= build_products
$hoe.spec.require_paths = Dir.glob('{lib,ext/*}')
$hoe.spec.extensions = FileList["ext/**/extconf.rb"].to_a
|
data/tasks/extconf/uchardet.rake
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# RbConfig replaces the legacy Config constant, which current Ruby versions
# no longer provide.
require 'rbconfig'

namespace :extconf do
  # The extension name is this file's base name without ".rake".
  extension = File.basename(__FILE__, '.rake')

  ext = "ext/#{extension}"
  ext_so = "#{ext}/#{extension}.#{RbConfig::CONFIG['DLEXT']}"
  ext_files = FileList[
    "#{ext}/*.c",
    "#{ext}/*.h",
    "#{ext}/*.rl",
    "#{ext}/extconf.rb",
    "#{ext}/Makefile",
    # "lib"
  ]


  # Fails the build loudly when no compiled artifact can be found.
  task :compile => extension do
    if Dir.glob("**/#{extension}.{o,so,dll}").length == 0
      # GEM_NAME is not defined in this file; use it when some other part of
      # the build provides it and fall back to the extension name otherwise.
      gem_name = defined?(GEM_NAME) ? GEM_NAME : extension
      STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
      STDERR.puts "Gem actually failed to build.  Your system is"
      STDERR.puts "NOT configured properly to build #{gem_name}."
      STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
      exit(1)
    end
  end

  desc "Builds just the #{extension} extension"
  task extension.to_sym => ["#{ext}/Makefile", ext_so ]

  file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
    Dir.chdir(ext) do ruby "extconf.rb" end
  end

  file ext_so => ext_files do
    Dir.chdir(ext) do
      sh(RUBY_PLATFORM =~ /win32/ ? 'nmake' : 'make') do |ok, res|
        if !ok
          # Remove partial build products so the next run starts clean.
          require "fileutils"
          FileUtils.rm Dir.glob('*.{so,o,dll,bundle}')
        end
      end
    end
  end
end
|
data/test/test_helper.rb
ADDED
data/test/test_uchardet.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# encoding: utf-8

require File.join(File.dirname(__FILE__), 'test_helper.rb')

# Checks that the class-level shortcuts return the same results as the
# corresponding instance methods.
class TestUchardet < Test::Unit::TestCase # :nodoc:

  def test_detect
    via_instance = ICU::UCharsetDetector.new.detect('')
    via_class = ICU::UCharsetDetector.detect('')
    assert_equal(via_instance, via_class)
  end

  def test_detect_all
    via_instance = ICU::UCharsetDetector.new.detect_all('∑')
    via_class = ICU::UCharsetDetector.detect_all('∑')
    assert_equal(via_instance, via_class)
  end

  def test_detectable_charsets
    via_instance = ICU::UCharsetDetector.new.detectable_charsets
    via_class = ICU::UCharsetDetector.detectable_charsets
    assert_equal(via_instance, via_class)
  end

end
|
data/test/test_uchardet_cli.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), "test_helper.rb")
require 'uchardet/cli'

# Runs the CLI without arguments and checks the usage text it prints.
class TestUchardetCli < Test::Unit::TestCase
  def setup
    @stdout_io = StringIO.new
    Uchardet::CLI.execute(@stdout_io, [])
    @stdout = @stdout_io.string
  end

  def test_print_default_output
    assert_match(/Usage: .* \[options\] file/, @stdout)
  end
end
|
data/test/test_uchardet_extn.rb
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
# encoding: utf-8

require "test/unit"

$:.unshift File.join(File.dirname(__FILE__), "/../ext/uchardet")
require "uchardet.so"

# Exercises the compiled extension directly (constructor, detection methods
# and attribute accessors).
class TestUchardetExtn < Test::Unit::TestCase # :nodoc:

  RESULT_KEYS = [:encoding, :confidence, :language]

  def test_init
    assert_not_nil(ICU::UCharsetDetector)

    assert_nothing_raised do
      assert_not_nil(ICU::UCharsetDetector.new)
      assert_not_nil(ICU::UCharsetDetector.new(nil))
      assert_not_nil(ICU::UCharsetDetector.new('some text'))
    end

    assert_raise(TypeError) { ICU::UCharsetDetector.new(0) }
    assert_raise(TypeError) { ICU::UCharsetDetector.new(Time.now) }
  end

  def test_detect
    detector = ICU::UCharsetDetector.new
    assert_raise(ICU::Error) { detector.detect }

    e = detector.detect('∂∆∂∆∂∆')
    assert(e.is_a?(Hash))
    RESULT_KEYS.each { |key| assert(e.has_key?(key)) }
    assert_equal('utf-8', e[:encoding].downcase)

    # The declared encoding is only a hint; the result stays utf-8.
    ['utf-8', 'Shift_JIS'].each do |hint|
      e = detector.detect('··', hint)
      assert_equal('utf-8', e[:encoding].downcase)
    end
  end

  def test_detect_all
    detector = ICU::UCharsetDetector.new
    assert_raise(ICU::Error) { detector.detect_all }

    a = detector.detect_all('€‹€‹€')
    assert(a.is_a?(Array))
    assert_equal(false, a.empty?)
    assert(a[0].is_a?(Hash))
    RESULT_KEYS.each { |key| assert(a[0].has_key?(key)) }
  end

  def test_input_filtered_accessor
    detector = ICU::UCharsetDetector.new
    assert_equal(false, detector.input_filtered?)

    # Any truthy value (including the empty string) enables the filter.
    [[true, true], ['', true], [nil, false]].each do |value, expected|
      detector.input_filtered = value
      assert_equal(expected, detector.input_filtered?)
    end
  end

  def test_text_accessor
    assert_equal(nil, ICU::UCharsetDetector.new.text)

    detector = ICU::UCharsetDetector.new('blah')
    assert_equal('blah', detector.text)

    detector.text = 'test'
    assert_equal('test', detector.text)

    detector.detect
    assert_equal('test', detector.text)
  end

  def test_declared_encoding_accessor
    detector = ICU::UCharsetDetector.new
    assert_equal(nil, detector.declared_encoding)

    detector.declared_encoding = 'iso-8859-15'
    assert_equal('iso-8859-15', detector.declared_encoding)

    detector.detect('test')
    assert_equal('iso-8859-15', detector.declared_encoding)
  end

  def test_detectable_charsets
    detector = ICU::UCharsetDetector.new
    assert_not_nil(detector.detectable_charsets)
    assert(detector.detectable_charsets.is_a?(Array))
  end

end
|
metadata
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: uchardet
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dmitri Goutnik
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-12-19 00:00:00 +03:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hoe
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 2.4.0
|
24
|
+
version:
|
25
|
+
description: Fast character set encoding detection using International Components for Unicode C++ library.
|
26
|
+
email:
|
27
|
+
- dg@syrec.org
|
28
|
+
executables:
|
29
|
+
- uchardet
|
30
|
+
extensions:
|
31
|
+
- ext/uchardet/extconf.rb
|
32
|
+
extra_rdoc_files:
|
33
|
+
- History.txt
|
34
|
+
- Manifest.txt
|
35
|
+
- README.rdoc
|
36
|
+
files:
|
37
|
+
- History.txt
|
38
|
+
- Manifest.txt
|
39
|
+
- README.rdoc
|
40
|
+
- Rakefile
|
41
|
+
- bin/uchardet
|
42
|
+
- ext/uchardet/extconf.rb
|
43
|
+
- ext/uchardet/uchardet.c
|
44
|
+
- lib/uchardet.rb
|
45
|
+
- lib/uchardet/cli.rb
|
46
|
+
- script/console
|
47
|
+
- script/destroy
|
48
|
+
- script/generate
|
49
|
+
- tasks/extconf.rake
|
50
|
+
- tasks/extconf/uchardet.rake
|
51
|
+
- test/test_helper.rb
|
52
|
+
- test/test_uchardet.rb
|
53
|
+
- test/test_uchardet_cli.rb
|
54
|
+
- test/test_uchardet_extn.rb
|
55
|
+
has_rdoc: true
|
56
|
+
homepage: http://github.com/invisiblellama/uchardet
|
57
|
+
licenses: []
|
58
|
+
|
59
|
+
post_install_message:
|
60
|
+
rdoc_options:
|
61
|
+
- --main
|
62
|
+
- README.rdoc
|
63
|
+
require_paths:
|
64
|
+
- lib
|
65
|
+
- ext/uchardet
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: "0"
|
71
|
+
version:
|
72
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: "0"
|
77
|
+
version:
|
78
|
+
requirements: []
|
79
|
+
|
80
|
+
rubyforge_project: uchardet
|
81
|
+
rubygems_version: 1.3.5
|
82
|
+
signing_key:
|
83
|
+
specification_version: 3
|
84
|
+
summary: Fast character set encoding detection using International Components for Unicode C++ library.
|
85
|
+
test_files:
|
86
|
+
- test/test_helper.rb
|
87
|
+
- test/test_uchardet.rb
|
88
|
+
- test/test_uchardet_cli.rb
|
89
|
+
- test/test_uchardet_extn.rb
|