uchardet 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +3 -0
- data/Manifest.txt +18 -0
- data/README.rdoc +50 -0
- data/Rakefile +23 -0
- data/bin/uchardet +8 -0
- data/ext/uchardet/extconf.rb +11 -0
- data/ext/uchardet/uchardet.c +340 -0
- data/lib/uchardet.rb +37 -0
- data/lib/uchardet/cli.rb +80 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/tasks/extconf.rake +13 -0
- data/tasks/extconf/uchardet.rake +43 -0
- data/test/test_helper.rb +3 -0
- data/test/test_uchardet.rb +22 -0
- data/test/test_uchardet_cli.rb +14 -0
- data/test/test_uchardet_extn.rb +101 -0
- metadata +89 -0
data/History.txt
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
History.txt
|
2
|
+
Manifest.txt
|
3
|
+
README.rdoc
|
4
|
+
Rakefile
|
5
|
+
bin/uchardet
|
6
|
+
ext/uchardet/extconf.rb
|
7
|
+
ext/uchardet/uchardet.c
|
8
|
+
lib/uchardet.rb
|
9
|
+
lib/uchardet/cli.rb
|
10
|
+
script/console
|
11
|
+
script/destroy
|
12
|
+
script/generate
|
13
|
+
tasks/extconf.rake
|
14
|
+
tasks/extconf/uchardet.rake
|
15
|
+
test/test_helper.rb
|
16
|
+
test/test_uchardet.rb
|
17
|
+
test/test_uchardet_cli.rb
|
18
|
+
test/test_uchardet_extn.rb
|
data/README.rdoc
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
= uchardet
|
2
|
+
|
3
|
+
* http://github.com/invisiblellama/uchardet
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
Fast character set encoding detection using the International Components for Unicode (ICU) library.
|
8
|
+
|
9
|
+
== SYNOPSIS:
|
10
|
+
|
11
|
+
require 'open-uri'
|
12
|
+
require 'uchardet'
|
13
|
+
|
14
|
+
encoding = ICU::UCharsetDetector.detect open('http://google.jp').read
|
15
|
+
encoding # => {:language=>"ja", :encoding=>"Shift_JIS", :confidence=>100}
|
16
|
+
|
17
|
+
From command line:
|
18
|
+
|
19
|
+
$ uchardet
|
20
|
+
|
21
|
+
Usage: uchardet [options] file
|
22
|
+
-l, --list Display list of detectable character sets.
|
23
|
+
-s, --strip Strip HTML or XML markup before detection.
|
24
|
+
-e, --encoding Hint the charset detector about possible encoding.
|
25
|
+
-a, --all Show all matching encodings.
|
26
|
+
-h, --help Show this help message.
|
27
|
+
|
28
|
+
$ uchardet `which uchardet`
|
29
|
+
|
30
|
+
ISO-8859-1 (confidence 60%)
|
31
|
+
|
32
|
+
== REQUIREMENTS:
|
33
|
+
|
34
|
+
ICU[http://site.icu-project.org/] (International Components for Unicode):
|
35
|
+
|
36
|
+
on Mac OS X:
|
37
|
+
|
38
|
+
sudo port install icu
|
39
|
+
|
40
|
+
on Debian/Ubuntu:
|
41
|
+
|
42
|
+
sudo apt-get install libicu-dev
|
43
|
+
|
44
|
+
== INSTALL:
|
45
|
+
|
46
|
+
sudo gem install uchardet
|
47
|
+
|
48
|
+
== LICENSE:
|
49
|
+
|
50
|
+
Copyright (c) 2009 Dmitri Goutnik, released under the MIT license.
|
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
gem 'hoe', '>= 2.1.0'
|
3
|
+
require 'hoe'
|
4
|
+
require 'fileutils'
|
5
|
+
require './lib/uchardet'
|
6
|
+
|
7
|
+
Hoe.plugin(:newgem)

# Describe the gem; Hoe generates the Rake tasks from this specification.
# Run 'rake -T' from the gem root directory to list the generated tasks.
$hoe = Hoe.spec('uchardet') do
  developer('Dmitri Goutnik', 'dg@syrec.org')
  self.readme_file      = 'README.rdoc'
  self.extra_rdoc_files = %w[README.rdoc]
  self.rubyforge_name   = name
end

require 'newgem/tasks'

# Pull in every project-specific task definition below tasks/.
Dir.glob('tasks/**/*.rake').each do |rake_file|
  load rake_file
end

# TODO - want other tests/tasks run by default? Add them to the list
# remove_task :default
# task :default => [:spec, :features]
|
data/bin/uchardet
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'mkmf'
|
2
|
+
|
3
|
+
# Locate ICU's icu-config helper; without it the linker flags are unknown.
icu_config = `which icu-config`.strip
abort "ICU seems to be missing. Try 'port install icu' or 'apt-get install libicu-dev'" if icu_config.empty?

# Append the libraries and library search path reported by icu-config.
icu_libs = `#{icu_config} --ldflags-system --ldflags-libsonly`.strip
$LIBS << " #{icu_libs}"

icu_search_path = `#{icu_config} --ldflags-searchpath`.strip
$LDFLAGS << " #{icu_search_path}"

create_makefile("uchardet")
|
@@ -0,0 +1,340 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <unicode/ucsdet.h>
|
3
|
+
|
4
|
+
/*
 * Fallback accessors for Ruby headers that do not provide RSTRING_PTR /
 * RSTRING_LEN. The expansions are fully parenthesized so they behave as a
 * single operand inside any surrounding expression.
 */
#ifndef RSTRING_PTR
# define RSTRING_PTR(str) (RSTRING(str)->ptr)
# define RSTRING_LEN(str) (RSTRING(str)->len)
#endif
|
8
|
+
|
9
|
+
static VALUE cUChardetError;
|
10
|
+
static VALUE cUCharsetDetector;
|
11
|
+
|
12
|
+
static void
|
13
|
+
assure(UErrorCode status)
|
14
|
+
{
|
15
|
+
if (U_FAILURE(status)) {
|
16
|
+
VALUE ex = rb_exc_new2(cUChardetError, u_errorName(status));
|
17
|
+
rb_iv_set(ex, "@errno", INT2FIX(status));
|
18
|
+
rb_exc_raise(ex);
|
19
|
+
}
|
20
|
+
}
|
21
|
+
|
22
|
+
static void
|
23
|
+
UCharsetDetector_free(void *detector)
|
24
|
+
{
|
25
|
+
ucsdet_close(detector);
|
26
|
+
}
|
27
|
+
|
28
|
+
static VALUE
|
29
|
+
UCharsetDetector_alloc(VALUE klass)
|
30
|
+
{
|
31
|
+
UErrorCode status = U_ZERO_ERROR;
|
32
|
+
UCharsetDetector* detector = ucsdet_open(&status);
|
33
|
+
assure(status);
|
34
|
+
|
35
|
+
return Data_Wrap_Struct(klass, NULL, UCharsetDetector_free, detector);
|
36
|
+
}
|
37
|
+
|
38
|
+
/*
|
39
|
+
* call-seq:
|
40
|
+
* input_filtered
|
41
|
+
*
|
42
|
+
* Return filtering flag value this charset detector.
|
43
|
+
*/
|
44
|
+
static VALUE
|
45
|
+
UCharsetDetector_get_input_filtered(VALUE self)
|
46
|
+
{
|
47
|
+
UCharsetDetector *detector;
|
48
|
+
Data_Get_Struct(self, UCharsetDetector, detector);
|
49
|
+
|
50
|
+
return ucsdet_isInputFilterEnabled(detector) ? Qtrue : Qfalse;
|
51
|
+
}
|
52
|
+
|
53
|
+
/*
|
54
|
+
* call-seq:
|
55
|
+
* input_filtered=
|
56
|
+
*
|
57
|
+
* Enable filtering of input text. If filtering is enabled,
|
58
|
+
* text within angle brackets ("<" and ">") will be removed
|
59
|
+
* before detection, which will remove most HTML or xml markup.
|
60
|
+
*/
|
61
|
+
static VALUE
|
62
|
+
UCharsetDetector_set_input_filtered(VALUE self, VALUE flag)
|
63
|
+
{
|
64
|
+
UCharsetDetector *detector;
|
65
|
+
Data_Get_Struct(self, UCharsetDetector, detector);
|
66
|
+
|
67
|
+
ucsdet_enableInputFilter(detector, RTEST(flag) ? TRUE : FALSE);
|
68
|
+
return self;
|
69
|
+
}
|
70
|
+
|
71
|
+
/*
|
72
|
+
* call-seq:
|
73
|
+
* text
|
74
|
+
*
|
75
|
+
* Get input text for this detector.
|
76
|
+
*/
|
77
|
+
static VALUE
|
78
|
+
UCharsetDetector_get_text(VALUE self)
|
79
|
+
{
|
80
|
+
return rb_iv_get(self, "@text");
|
81
|
+
}
|
82
|
+
|
83
|
+
/*
|
84
|
+
* call-seq:
|
85
|
+
* text=
|
86
|
+
*
|
87
|
+
* Set input text for this detector.
|
88
|
+
*/
|
89
|
+
static VALUE
|
90
|
+
UCharsetDetector_set_text(VALUE self, VALUE text)
|
91
|
+
{
|
92
|
+
return rb_iv_set(self, "@text", text);
|
93
|
+
return text;
|
94
|
+
}
|
95
|
+
|
96
|
+
/*
|
97
|
+
* call-seq:
|
98
|
+
* declared_encoding
|
99
|
+
*
|
100
|
+
* Get the declared encoding for charset detection.
|
101
|
+
*/
|
102
|
+
static VALUE
|
103
|
+
UCharsetDetector_get_declared_encoding(VALUE self)
|
104
|
+
{
|
105
|
+
return rb_iv_get(self, "@declared_encoding");
|
106
|
+
}
|
107
|
+
|
108
|
+
/*
|
109
|
+
* call-seq:
|
110
|
+
* declared_encoding=
|
111
|
+
*
|
112
|
+
* Set the declared encoding for charset detection.
|
113
|
+
* The declared encoding of an input text is an encoding obtained
|
114
|
+
* by the user from an http header or xml declaration or similar source that
|
115
|
+
* can be provided as an additional hint to the charset detector.
|
116
|
+
*/
|
117
|
+
static VALUE
|
118
|
+
UCharsetDetector_set_declared_encoding(VALUE self, VALUE declared_encoding)
|
119
|
+
{
|
120
|
+
return rb_iv_set(self, "@declared_encoding", declared_encoding);
|
121
|
+
return declared_encoding;
|
122
|
+
}
|
123
|
+
|
124
|
+
static void
|
125
|
+
set_text(VALUE self, VALUE text)
|
126
|
+
{
|
127
|
+
if (!NIL_P(text)) {
|
128
|
+
text = StringValue(text);
|
129
|
+
|
130
|
+
UErrorCode status = U_ZERO_ERROR;
|
131
|
+
UCharsetDetector *detector;
|
132
|
+
Data_Get_Struct(self, UCharsetDetector, detector);
|
133
|
+
|
134
|
+
ucsdet_setText(detector, StringValuePtr(text), RSTRING_LEN(text), &status);
|
135
|
+
assure(status);
|
136
|
+
|
137
|
+
UCharsetDetector_set_text(self, text);
|
138
|
+
}
|
139
|
+
}
|
140
|
+
|
141
|
+
static void
|
142
|
+
set_declared_encoding(VALUE self, VALUE declared_encoding)
|
143
|
+
{
|
144
|
+
if (!NIL_P(declared_encoding)){
|
145
|
+
declared_encoding = StringValue(declared_encoding);
|
146
|
+
|
147
|
+
UErrorCode status = U_ZERO_ERROR;
|
148
|
+
UCharsetDetector *detector;
|
149
|
+
Data_Get_Struct(self, UCharsetDetector, detector);
|
150
|
+
|
151
|
+
ucsdet_setDeclaredEncoding(detector, StringValuePtr(declared_encoding), RSTRING_LEN(declared_encoding), &status);
|
152
|
+
assure(status);
|
153
|
+
|
154
|
+
UCharsetDetector_set_declared_encoding(self, declared_encoding);
|
155
|
+
}
|
156
|
+
}
|
157
|
+
|
158
|
+
/*
|
159
|
+
* call-seq:
|
160
|
+
* new(text=nil, declared_encoding=nil)
|
161
|
+
*
|
162
|
+
* Create a new charset detector. Optionally set input text and declared encoding.
|
163
|
+
*/
|
164
|
+
static VALUE
|
165
|
+
UCharsetDetector_initialize(int argc, VALUE *argv, VALUE self)
|
166
|
+
{
|
167
|
+
VALUE text;
|
168
|
+
VALUE declared_encoding;
|
169
|
+
|
170
|
+
rb_scan_args(argc, argv, "02", &text, &declared_encoding);
|
171
|
+
if (NIL_P(text))
|
172
|
+
UCharsetDetector_set_text(self, Qnil);
|
173
|
+
else
|
174
|
+
set_text(self, text);
|
175
|
+
|
176
|
+
if (NIL_P(declared_encoding))
|
177
|
+
UCharsetDetector_set_declared_encoding(self, Qnil);
|
178
|
+
else
|
179
|
+
set_declared_encoding(self, declared_encoding);
|
180
|
+
|
181
|
+
return self;
|
182
|
+
}
|
183
|
+
|
184
|
+
/*
|
185
|
+
* call-seq:
|
186
|
+
* detect(text=nil, declared_encoding=nil)
|
187
|
+
*
|
188
|
+
* Return the charset that best matches the supplied input data.
|
189
|
+
*
|
190
|
+
* Note though, that because the detection
|
191
|
+
* only looks at the start of the input data,
|
192
|
+
* there is a possibility that the returned charset will fail to handle
|
193
|
+
* the full set of input data.
|
194
|
+
*
|
195
|
+
* The function will fail if
|
196
|
+
* * no charset appears to match the data
|
197
|
+
* * no input text has been provided (with +text+ or set with #text= )
|
198
|
+
*/
|
199
|
+
static VALUE
|
200
|
+
UCharsetDetector_detect(int argc, VALUE *argv, VALUE self)
|
201
|
+
{
|
202
|
+
VALUE text;
|
203
|
+
VALUE declared_encoding;
|
204
|
+
|
205
|
+
rb_scan_args(argc, argv, "02", &text, &declared_encoding);
|
206
|
+
set_text(self, text);
|
207
|
+
set_declared_encoding(self, declared_encoding);
|
208
|
+
|
209
|
+
UErrorCode status = U_ZERO_ERROR;
|
210
|
+
UCharsetDetector *detector;
|
211
|
+
Data_Get_Struct(self, UCharsetDetector, detector);
|
212
|
+
|
213
|
+
const UCharsetMatch *match = ucsdet_detect(detector, &status);
|
214
|
+
assure(status);
|
215
|
+
|
216
|
+
const char *encoding_name = ucsdet_getName(match, &status);
|
217
|
+
assure(status);
|
218
|
+
|
219
|
+
int32_t encoding_confidence = ucsdet_getConfidence(match, &status);
|
220
|
+
assure(status);
|
221
|
+
|
222
|
+
const char *encoding_language = ucsdet_getLanguage(match, &status);
|
223
|
+
assure(status);
|
224
|
+
|
225
|
+
VALUE hash = rb_hash_new();
|
226
|
+
rb_hash_aset(hash, ID2SYM(rb_intern("encoding")), rb_str_new2(encoding_name));
|
227
|
+
rb_hash_aset(hash, ID2SYM(rb_intern("confidence")), INT2NUM(encoding_confidence));
|
228
|
+
rb_hash_aset(hash, ID2SYM(rb_intern("language")), rb_str_new2(encoding_language));
|
229
|
+
|
230
|
+
return hash;
|
231
|
+
}
|
232
|
+
|
233
|
+
/*
|
234
|
+
* call-seq:
|
235
|
+
* detect_all(text=nil, declared_encoding=nil)
|
236
|
+
*
|
237
|
+
* Find all charset matches that appear to be consistent with the input,
|
238
|
+
* returning an array of results. The results are ordered with the
|
239
|
+
* best quality match first.
|
240
|
+
*
|
241
|
+
* Because the detection only looks at a limited amount of the
|
242
|
+
* input byte data, some of the returned charsets may fail to handle
|
243
|
+
* the all of input data.
|
244
|
+
*
|
245
|
+
* Return an error if
|
246
|
+
* * no charset appears to match the data
|
247
|
+
* * no input text has been provided (with +text+ or set with #text= )
|
248
|
+
*/
|
249
|
+
static VALUE
|
250
|
+
UCharsetDetector_detect_all(int argc, VALUE *argv, VALUE self)
|
251
|
+
{
|
252
|
+
VALUE text;
|
253
|
+
VALUE declared_encoding;
|
254
|
+
|
255
|
+
rb_scan_args(argc, argv, "02", &text, &declared_encoding);
|
256
|
+
set_text(self, text);
|
257
|
+
set_declared_encoding(self, declared_encoding);
|
258
|
+
|
259
|
+
UCharsetDetector *detector;
|
260
|
+
Data_Get_Struct(self, UCharsetDetector, detector);
|
261
|
+
UErrorCode status = U_ZERO_ERROR;
|
262
|
+
int32_t matches_found;
|
263
|
+
|
264
|
+
const UCharsetMatch **matches = ucsdet_detectAll(detector, &matches_found, &status);
|
265
|
+
assure(status);
|
266
|
+
|
267
|
+
VALUE ary = rb_ary_new();
|
268
|
+
int i = 0;
|
269
|
+
|
270
|
+
for (i = 0; i < matches_found; i++) {
|
271
|
+
const char *encoding_name = ucsdet_getName(matches[i], &status);
|
272
|
+
assure(status);
|
273
|
+
|
274
|
+
int32_t encoding_confidence = ucsdet_getConfidence(matches[i], &status);
|
275
|
+
assure(status);
|
276
|
+
|
277
|
+
const char *encoding_language = ucsdet_getLanguage(matches[i], &status);
|
278
|
+
assure(status);
|
279
|
+
|
280
|
+
VALUE hash = rb_hash_new();
|
281
|
+
rb_hash_aset(hash, ID2SYM(rb_intern("encoding")), rb_str_new2(encoding_name));
|
282
|
+
rb_hash_aset(hash, ID2SYM(rb_intern("confidence")), INT2NUM(encoding_confidence));
|
283
|
+
rb_hash_aset(hash, ID2SYM(rb_intern("language")), rb_str_new2(encoding_language));
|
284
|
+
|
285
|
+
rb_ary_push(ary, hash);
|
286
|
+
}
|
287
|
+
|
288
|
+
return ary;
|
289
|
+
}
|
290
|
+
|
291
|
+
/*
|
292
|
+
* call-seq:
|
293
|
+
* detectable_charsets
|
294
|
+
*
|
295
|
+
* Get array of names of all detectable charsets that are known to the
|
296
|
+
* charset detection service.
|
297
|
+
*/
|
298
|
+
static VALUE
|
299
|
+
UCharsetDetector_get_detectable_charsets(VALUE self)
|
300
|
+
{
|
301
|
+
UCharsetDetector *detector;
|
302
|
+
Data_Get_Struct(self, UCharsetDetector, detector);
|
303
|
+
UErrorCode status = U_ZERO_ERROR;
|
304
|
+
|
305
|
+
UEnumeration *charsets = ucsdet_getAllDetectableCharsets(detector, &status);
|
306
|
+
assure(status);
|
307
|
+
|
308
|
+
VALUE ary = rb_ary_new();
|
309
|
+
int32_t result_length;
|
310
|
+
const char *charset_name;
|
311
|
+
|
312
|
+
while (charset_name = uenum_next(charsets, &result_length, &status)) {
|
313
|
+
assure(status);
|
314
|
+
rb_ary_push(ary, rb_str_new2(charset_name));
|
315
|
+
}
|
316
|
+
uenum_close(charsets);
|
317
|
+
|
318
|
+
return ary;
|
319
|
+
}
|
320
|
+
|
321
|
+
void
|
322
|
+
Init_uchardet()
|
323
|
+
{
|
324
|
+
VALUE mICU = rb_define_module("ICU");
|
325
|
+
|
326
|
+
cUChardetError = rb_define_class_under(mICU, "Error", rb_eStandardError);
|
327
|
+
|
328
|
+
cUCharsetDetector = rb_define_class_under(mICU, "UCharsetDetector", rb_cObject);
|
329
|
+
rb_define_alloc_func(cUCharsetDetector, UCharsetDetector_alloc);
|
330
|
+
rb_define_method(cUCharsetDetector, "initialize", UCharsetDetector_initialize, -1);
|
331
|
+
rb_define_method(cUCharsetDetector, "input_filtered?", UCharsetDetector_get_input_filtered, 0);
|
332
|
+
rb_define_method(cUCharsetDetector, "input_filtered=", UCharsetDetector_set_input_filtered, 1);
|
333
|
+
rb_define_method(cUCharsetDetector, "text", UCharsetDetector_get_text, 0);
|
334
|
+
rb_define_method(cUCharsetDetector, "text=", UCharsetDetector_set_text, 1);
|
335
|
+
rb_define_method(cUCharsetDetector, "declared_encoding", UCharsetDetector_get_declared_encoding, 0);
|
336
|
+
rb_define_method(cUCharsetDetector, "declared_encoding=", UCharsetDetector_set_declared_encoding, 1);
|
337
|
+
rb_define_method(cUCharsetDetector, "detect", UCharsetDetector_detect, -1);
|
338
|
+
rb_define_method(cUCharsetDetector, "detect_all", UCharsetDetector_detect_all, -1);
|
339
|
+
rb_define_method(cUCharsetDetector, "detectable_charsets", UCharsetDetector_get_detectable_charsets, 0);
|
340
|
+
}
|
data/lib/uchardet.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# Put this file's directory on the load path unless it is already listed,
# either as given or in expanded form.
lib_dir = File.dirname(__FILE__)
unless [lib_dir, File.expand_path(lib_dir)].any? { |dir| $LOAD_PATH.include?(dir) }
  $LOAD_PATH.unshift(lib_dir)
end

# Namespace carrying the gem version.
module Uchardet
  VERSION = '0.1.1'
end

begin
  require 'uchardet.so'
rescue LoadError
  # Deliberately ignored, so this file still loads when the compiled
  # extension cannot be found.
  # NOTE(review): presumably needed because the Rakefile requires this file
  # before the extension is built - confirm.
end
|
13
|
+
|
14
|
+
module ICU # :main: README
  class UCharsetDetector # :main: README
    class << self
      ##
      # Shortcut for ICU::UCharsetDetector#detect on a fresh detector.
      #
      def detect(*args)
        new.detect(*args)
      end

      ##
      # Shortcut for ICU::UCharsetDetector#detect_all on a fresh detector.
      #
      def detect_all(*args)
        new.detect_all(*args)
      end

      ##
      # Shortcut for ICU::UCharsetDetector#detectable_charsets on a fresh
      # detector.
      #
      def detectable_charsets
        new.detectable_charsets
      end
    end
  end
end
|
data/lib/uchardet/cli.rb
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
|
3
|
+
module Uchardet
  # Command line front end for ICU::UCharsetDetector.
  class CLI
    # Parse +args+ and act on them: print the usage text when +args+ is
    # empty, list the detectable charsets for -l, otherwise detect the
    # encoding of the file named by the last argument.
    #
    # Regular output is written to +stdout+, error messages to STDERR.
    # Invalid options or a missing file path terminate the process with
    # exit status 1. Returns the OptionParser instance.
    def self.execute(stdout, args=[])
      @stdout = stdout
      @options = {
        :input_filtered => false,
        :declared_encoding => nil,
        :detect_all => false,
        :path => nil
      }

      parser = OptionParser.new do |opts|
        opts.banner = <<-BANNER.gsub(/^\s*/,'')
          Usage: #{File.basename($0)} [options] file
        BANNER

        opts.on("-l", "--list",
          "Display list of detectable character sets."
        ) { self.list; exit }
        opts.on("-s", "--strip",
          "Strip HTML or XML markup before detection."
        ) { @options[:input_filtered] = true }
        # The ENCODING placeholder makes the switch take a mandatory
        # argument; without it the block only ever receives +true+.
        opts.on("-e", "--encoding ENCODING",
          "Hint the charset detector about possible encoding."
        ) { |arg| @options[:declared_encoding] = arg }
        opts.on("-a", "--all",
          "Show all matching encodings."
        ) { @options[:detect_all] = true }
        opts.on("-h", "--help",
          "Show this help message."
        ) { @stdout.puts opts; exit }
      end

      if args.empty?
        @stdout.puts parser
        return parser
      end

      begin
        parser.parse!(args)
      rescue OptionParser::ParseError => ex
        STDERR.puts "ERROR: #{ex.to_s}. See #{File.basename($0)} --help"
        exit 1
      end

      # parse! removes the recognised options; what is left is the file path.
      @options[:path] = args.last
      if @options[:path].nil? || @options[:path].empty?
        @stdout.puts parser
        STDERR.puts "ERROR: please specify a file path."
        exit 1
      end

      self.detect
      parser
    end

    # Print the sorted, de-duplicated names of all detectable charsets.
    def self.list
      ICU::UCharsetDetector.detectable_charsets.uniq.sort.each { |name| @stdout.puts name }
    end

    # Detect the encoding of the file at @options[:path] and print one
    # "<encoding> (confidence <n>%)" line per match. Errors are reported on
    # STDERR instead of being raised.
    def self.detect
      detector = ICU::UCharsetDetector.new
      detector.input_filtered = @options[:input_filtered]

      # Read raw bytes: detection must see the file content unmodified.
      source = File.open(@options[:path], 'rb') { |file| file.read }

      # Pass the hint explicitly so it reaches the detector for this call.
      declared = @options[:declared_encoding]
      matches = if @options[:detect_all]
        detector.detect_all(source, declared)
      else
        [detector.detect(source, declared)]
      end

      matches.each do |match|
        @stdout.puts "#{match[:encoding]} (confidence #{match[:confidence]}%)"
      end
    rescue StandardError => ex
      # StandardError only: Interrupt, SystemExit and similar must propagate.
      STDERR.puts "ERROR: #{ex.to_s}"
    end
  end
end
|
data/script/console
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
#!/usr/bin/env ruby
# File: script/console
# Starts an irb session with this gem's library preloaded.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

libs =  " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
# The library entry point is lib/uchardet.rb (see Manifest.txt).
libs << " -r #{File.dirname(__FILE__) + '/../lib/uchardet.rb'}"
puts "Loading uchardet gem"
exec "#{irb} #{libs} --simple-prompt"
|
data/script/destroy
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs RubiGen's destroy script with this project's component sources.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try a plain require first; fall back to loading RubyGems if that fails.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV.first)

RubiGen::Base.use_component_sources!([:rubygems, :newgem, :newgem_theme, :test_unit])
RubiGen::Scripts::Destroy.new.run(ARGV)
|
data/script/generate
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Runs RubiGen's generate script with this project's component sources.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try a plain require first; fall back to loading RubyGems if that fails.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV.first)

RubiGen::Base.use_component_sources!([:rubygems, :newgem, :newgem_theme, :test_unit])
RubiGen::Scripts::Generate.new.run(ARGV)
|
data/tasks/extconf.rake
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Umbrella task; extension-specific rake files add prerequisites to it.
namespace :extconf do
  desc "Compiles the Ruby extension"
  task :compile
end

# Top-level aliases: compiling delegates to extconf:compile and the tests
# depend on it.
task :compile => ["extconf:compile"]
task :test => [:compile]

# Glob matching build artefacts.
BIN = "*.{o,bundle,jar,so,obj,pdb,lib,def,exp}"

$hoe.clean_globs |= %w[ext lib].map { |dir| "#{dir}/**/#{BIN}" } + ['ext/**/Makefile']
$hoe.spec.require_paths = Dir.glob('{lib,ext/*}')
$hoe.spec.extensions = FileList["ext/**/extconf.rb"].to_a
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# Build tasks for one extension; the extension name is taken from this
# file's own basename.
namespace :extconf do
  require 'rbconfig'

  extension = File.basename(__FILE__, '.rake')

  # RbConfig replaces the deprecated Config constant; fall back to Config
  # where RbConfig is not defined.
  rbconfig = defined?(RbConfig) ? RbConfig : Config

  ext = "ext/#{extension}"
  ext_so = "#{ext}/#{extension}.#{rbconfig::CONFIG['DLEXT']}"
  ext_files = FileList[
    "#{ext}/*.c",
    "#{ext}/*.h",
    "#{ext}/*.rl",
    "#{ext}/extconf.rb",
    "#{ext}/Makefile",
    # "lib"
  ]


  # Fails the build loudly when no compiled artefact was produced.
  task :compile => extension do
    if Dir.glob("**/#{extension}.{o,so,dll}").length == 0
      # GEM_NAME is not defined in this file; use it when available and
      # fall back to the extension name so the message cannot raise.
      gem_name = defined?(GEM_NAME) ? GEM_NAME : extension
      STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
      STDERR.puts "Gem actually failed to build.  Your system is"
      STDERR.puts "NOT configured properly to build #{gem_name}."
      STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
      exit(1)
    end
  end

  desc "Builds just the #{extension} extension"
  task extension.to_sym => ["#{ext}/Makefile", ext_so ]

  # Regenerate the Makefile whenever extconf.rb changes.
  file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
    Dir.chdir(ext) do ruby "extconf.rb" end
  end

  # Run make in the extension directory; remove partial build products
  # when make fails.
  file ext_so => ext_files do
    Dir.chdir(ext) do
      sh(RUBY_PLATFORM =~ /win32/ ? 'nmake' : 'make') do |ok, res|
        if !ok
          require "fileutils"
          FileUtils.rm Dir.glob('*.{so,o,dll,bundle}')
        end
      end
    end
  end
end
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require File.dirname(__FILE__) + '/test_helper.rb'
|
4
|
+
|
5
|
+
# Checks that the class-level shortcuts agree with explicit instance calls.
class TestUchardet < Test::Unit::TestCase # :nodoc:

  def test_detect
    via_instance = ICU::UCharsetDetector.new.detect('')
    via_class = ICU::UCharsetDetector.detect('')
    assert_equal(via_instance, via_class)
  end

  def test_detect_all
    via_instance = ICU::UCharsetDetector.new.detect_all('∑')
    via_class = ICU::UCharsetDetector.detect_all('∑')
    assert_equal(via_instance, via_class)
  end

  def test_detectable_charsets
    via_instance = ICU::UCharsetDetector.new.detectable_charsets
    via_class = ICU::UCharsetDetector.detectable_charsets
    assert_equal(via_instance, via_class)
  end

end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), "test_helper.rb")
|
2
|
+
require 'uchardet/cli'
|
3
|
+
|
4
|
+
# Runs the CLI without arguments and checks the usage text it prints.
class TestUchardetCli < Test::Unit::TestCase
  def setup
    @stdout_io = StringIO.new
    Uchardet::CLI.execute(@stdout_io, [])
    @stdout_io.rewind
    @stdout = @stdout_io.read
  end

  def test_print_default_output
    usage_pattern = /Usage: .* \[options\] file/
    assert_match(usage_pattern, @stdout)
  end
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require "test/unit"
|
4
|
+
|
5
|
+
$:.unshift File.dirname(__FILE__) + "/../ext/uchardet"
|
6
|
+
require "uchardet.so"
|
7
|
+
|
8
|
+
# Exercises the compiled extension directly.
class TestUchardetExtn < Test::Unit::TestCase # :nodoc:

  # Construction accepts no argument, nil or a String; other types raise.
  def test_init
    assert_not_nil(ICU::UCharsetDetector)

    assert_nothing_raised do
      [[], [nil], ['some text']].each do |ctor_args|
        assert_not_nil(ICU::UCharsetDetector.new(*ctor_args))
      end
    end

    assert_raise(TypeError) { ICU::UCharsetDetector.new 0 }
    assert_raise(TypeError) { ICU::UCharsetDetector.new Time.now }
  end

  # detect needs input text and returns a Hash describing the best match.
  def test_detect
    detector = ICU::UCharsetDetector.new
    assert_raise(ICU::Error) { detector.detect }

    match = detector.detect '∂∆∂∆∂∆'
    assert(match.is_a?(Hash))
    [:encoding, :confidence, :language].each do |key|
      assert(match.has_key?(key))
    end
    assert_equal('utf-8', match[:encoding].downcase)

    match = detector.detect '··', 'utf-8'
    assert_equal('utf-8', match[:encoding].downcase)

    match = detector.detect '··', 'Shift_JIS'
    assert_equal('utf-8', match[:encoding].downcase)
  end

  # detect_all needs input text and returns a non-empty Array of Hashes.
  def test_detect_all
    detector = ICU::UCharsetDetector.new
    assert_raise(ICU::Error) { detector.detect_all }

    matches = detector.detect_all '€‹€‹€'
    assert(matches.is_a?(Array))
    assert_equal(false, matches.empty?)

    best = matches[0]
    assert(best.is_a?(Hash))
    [:encoding, :confidence, :language].each do |key|
      assert(best.has_key?(key))
    end
  end

  # Any truthy value enables input filtering, nil disables it.
  def test_input_filtered_accessor
    detector = ICU::UCharsetDetector.new
    assert_equal(false, detector.input_filtered?)

    [[true, true], ['', true], [nil, false]].each do |value, expected|
      detector.input_filtered = value
      assert_equal(expected, detector.input_filtered?)
    end
  end

  def test_text_accessor
    assert_equal(nil, ICU::UCharsetDetector.new.text)

    detector = ICU::UCharsetDetector.new 'blah'
    assert_equal('blah', detector.text)

    detector.text = 'test'
    assert_equal('test', detector.text)

    # Detection without arguments must leave the stored text alone.
    detector.detect
    assert_equal('test', detector.text)
  end

  def test_declared_encoding_accessor
    detector = ICU::UCharsetDetector.new
    assert_equal(nil, detector.declared_encoding)

    detector.declared_encoding = 'iso-8859-15'
    assert_equal('iso-8859-15', detector.declared_encoding)

    # Detection without a hint must leave the stored encoding alone.
    detector.detect 'test'
    assert_equal('iso-8859-15', detector.declared_encoding)
  end

  def test_detectable_charsets
    detector = ICU::UCharsetDetector.new
    assert_not_nil(detector.detectable_charsets)
    assert(detector.detectable_charsets.is_a?(Array))
  end

end
|
metadata
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: uchardet
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dmitri Goutnik
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-12-19 00:00:00 +03:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hoe
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 2.4.0
|
24
|
+
version:
|
25
|
+
description: Fast character set encoding detection using International Components for Unicode C++ library.
|
26
|
+
email:
|
27
|
+
- dg@syrec.org
|
28
|
+
executables:
|
29
|
+
- uchardet
|
30
|
+
extensions:
|
31
|
+
- ext/uchardet/extconf.rb
|
32
|
+
extra_rdoc_files:
|
33
|
+
- History.txt
|
34
|
+
- Manifest.txt
|
35
|
+
- README.rdoc
|
36
|
+
files:
|
37
|
+
- History.txt
|
38
|
+
- Manifest.txt
|
39
|
+
- README.rdoc
|
40
|
+
- Rakefile
|
41
|
+
- bin/uchardet
|
42
|
+
- ext/uchardet/extconf.rb
|
43
|
+
- ext/uchardet/uchardet.c
|
44
|
+
- lib/uchardet.rb
|
45
|
+
- lib/uchardet/cli.rb
|
46
|
+
- script/console
|
47
|
+
- script/destroy
|
48
|
+
- script/generate
|
49
|
+
- tasks/extconf.rake
|
50
|
+
- tasks/extconf/uchardet.rake
|
51
|
+
- test/test_helper.rb
|
52
|
+
- test/test_uchardet.rb
|
53
|
+
- test/test_uchardet_cli.rb
|
54
|
+
- test/test_uchardet_extn.rb
|
55
|
+
has_rdoc: true
|
56
|
+
homepage: http://github.com/invisiblellama/uchardet
|
57
|
+
licenses: []
|
58
|
+
|
59
|
+
post_install_message:
|
60
|
+
rdoc_options:
|
61
|
+
- --main
|
62
|
+
- README.rdoc
|
63
|
+
require_paths:
|
64
|
+
- lib
|
65
|
+
- ext/uchardet
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: "0"
|
71
|
+
version:
|
72
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: "0"
|
77
|
+
version:
|
78
|
+
requirements: []
|
79
|
+
|
80
|
+
rubyforge_project: uchardet
|
81
|
+
rubygems_version: 1.3.5
|
82
|
+
signing_key:
|
83
|
+
specification_version: 3
|
84
|
+
summary: Fast character set encoding detection using International Components for Unicode C++ library.
|
85
|
+
test_files:
|
86
|
+
- test/test_helper.rb
|
87
|
+
- test/test_uchardet.rb
|
88
|
+
- test/test_uchardet_cli.rb
|
89
|
+
- test/test_uchardet_extn.rb
|