uchardet 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
+ === 0.1.1 2009-12-19
2
+
3
+ * Initial release
@@ -0,0 +1,18 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ bin/uchardet
6
+ ext/uchardet/extconf.rb
7
+ ext/uchardet/uchardet.c
8
+ lib/uchardet.rb
9
+ lib/uchardet/cli.rb
10
+ script/console
11
+ script/destroy
12
+ script/generate
13
+ tasks/extconf.rake
14
+ tasks/extconf/uchardet.rake
15
+ test/test_helper.rb
16
+ test/test_uchardet.rb
17
+ test/test_uchardet_cli.rb
18
+ test/test_uchardet_extn.rb
@@ -0,0 +1,50 @@
1
+ = uchardet
2
+
3
+ * http://github.com/invisiblellama/uchardet
4
+
5
+ == DESCRIPTION:
6
+
7
+ Fast character set encoding detection using International Components for Unicode C++ library.
8
+
9
+ == SYNOPSIS:
10
+
11
+ require 'open-uri'
12
+ require 'uchardet'
13
+
14
+ encoding = ICU::UCharsetDetector.detect open('http://google.jp').read
15
+ encoding # => {:language=>"ja", :encoding=>"Shift_JIS", :confidence=>100}
16
+
17
+ From command line:
18
+
19
+ $ uchardet
20
+
21
+ Usage: uchardet [options] file
22
+ -l, --list Display list of detectable character sets.
23
+ -s, --strip Strip HTML or XML markup before detection.
24
+ -e, --encoding Hint the charset detector about possible encoding.
25
+ -a, --all Show all matching encodings.
26
+ -h, --help Show this help message.
27
+
28
+ $ uchardet `which uchardet`
29
+
30
+ ISO-8859-1 (confidence 60%)
31
+
32
+ == REQUIREMENTS:
33
+
34
+ ICU[http://site.icu-project.org/] (International Components for Unicode):
35
+
36
+ on Mac OS X:
37
+
38
+ sudo port install icu
39
+
40
+ on Debian/Ubuntu:
41
+
42
+ sudo apt-get install libicu-dev
43
+
44
+ == INSTALL:
45
+
46
+ sudo gem install uchardet
47
+
48
+ == LICENSE:
49
+
50
+ Copyright (c) 2009 Dmitri Goutnik, released under the MIT license.
@@ -0,0 +1,23 @@
1
+ require 'rubygems'
2
+ gem 'hoe', '>= 2.1.0'
3
+ require 'hoe'
4
+ require 'fileutils'
5
+ require './lib/uchardet'
6
+
7
+ Hoe.plugin :newgem
8
+
9
+ # Generate all the Rake tasks
10
+ # Run 'rake -T' to see list of generated tasks (from gem root directory)
11
+ $hoe = Hoe.spec 'uchardet' do
12
+ self.developer 'Dmitri Goutnik', 'dg@syrec.org'
13
+ self.readme_file = 'README.rdoc'
14
+ self.extra_rdoc_files = ['README.rdoc']
15
+ self.rubyforge_name = self.name
16
+ end
17
+
18
+ require 'newgem/tasks'
19
+ Dir['tasks/**/*.rake'].each { |t| load t }
20
+
21
+ # TODO - want other tests/tasks run by default? Add them to the list
22
+ # remove_task :default
23
+ # task :default => [:spec, :features]
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require File.expand_path(
5
+ File.join(File.dirname(__FILE__), %w[.. lib uchardet]))
6
+ require "uchardet/cli"
7
+
8
+ Uchardet::CLI.execute(STDOUT, ARGV)
@@ -0,0 +1,11 @@
1
require 'mkmf'

# Locate ICU's configuration helper on PATH. Every compiler and linker flag
# below is obtained from it, so the build cannot proceed without it.
icu_config = find_executable('icu-config')
unless icu_config
  abort "ICU seems to be missing. Try 'port install icu' or 'apt-get install libicu-dev'"
end

# Header search path: required when ICU is installed outside the compiler's
# default include directories, otherwise <unicode/ucsdet.h> is not found.
$CPPFLAGS << ' ' + `#{icu_config} --cppflags-searchpath`.strip
# Libraries to link against and the directories to look for them in.
$LIBS << ' ' + `#{icu_config} --ldflags-system --ldflags-libsonly`.strip
$LDFLAGS << ' ' + `#{icu_config} --ldflags-searchpath`.strip

create_makefile("uchardet")
@@ -0,0 +1,340 @@
1
+ #include <ruby.h>
2
+ #include <unicode/ucsdet.h>
3
+
4
+ #ifndef RSTRING_PTR
5
+ # define RSTRING_PTR(str) RSTRING(str)->ptr
6
+ # define RSTRING_LEN(str) RSTRING(str)->len
7
+ #endif
8
+
9
+ static VALUE cUChardetError;
10
+ static VALUE cUCharsetDetector;
11
+
12
+ static void
13
+ assure(UErrorCode status)
14
+ {
15
+ if (U_FAILURE(status)) {
16
+ VALUE ex = rb_exc_new2(cUChardetError, u_errorName(status));
17
+ rb_iv_set(ex, "@errno", INT2FIX(status));
18
+ rb_exc_raise(ex);
19
+ }
20
+ }
21
+
22
+ static void
23
+ UCharsetDetector_free(void *detector)
24
+ {
25
+ ucsdet_close(detector);
26
+ }
27
+
28
+ static VALUE
29
+ UCharsetDetector_alloc(VALUE klass)
30
+ {
31
+ UErrorCode status = U_ZERO_ERROR;
32
+ UCharsetDetector* detector = ucsdet_open(&status);
33
+ assure(status);
34
+
35
+ return Data_Wrap_Struct(klass, NULL, UCharsetDetector_free, detector);
36
+ }
37
+
38
+ /*
39
+ * call-seq:
40
+ * input_filtered
41
+ *
42
+ * Return the input filtering flag of this charset detector.
43
+ */
44
+ static VALUE
45
+ UCharsetDetector_get_input_filtered(VALUE self)
46
+ {
47
+ UCharsetDetector *detector;
48
+ Data_Get_Struct(self, UCharsetDetector, detector);
49
+
50
+ return ucsdet_isInputFilterEnabled(detector) ? Qtrue : Qfalse;
51
+ }
52
+
53
+ /*
54
+ * call-seq:
55
+ * input_filtered=
56
+ *
57
+ * Enable filtering of input text. If filtering is enabled,
58
+ * text within angle brackets ("<" and ">") will be removed
59
+ * before detection, which will remove most HTML or xml markup.
60
+ */
61
+ static VALUE
62
+ UCharsetDetector_set_input_filtered(VALUE self, VALUE flag)
63
+ {
64
+ UCharsetDetector *detector;
65
+ Data_Get_Struct(self, UCharsetDetector, detector);
66
+
67
+ ucsdet_enableInputFilter(detector, RTEST(flag) ? TRUE : FALSE);
68
+ return self;
69
+ }
70
+
71
+ /*
72
+ * call-seq:
73
+ * text
74
+ *
75
+ * Get input text for this detector.
76
+ */
77
+ static VALUE
78
+ UCharsetDetector_get_text(VALUE self)
79
+ {
80
+ return rb_iv_get(self, "@text");
81
+ }
82
+
83
+ /*
84
+ * call-seq:
85
+ * text=
86
+ *
87
+ * Set input text for this detector.
88
+ */
89
+ static VALUE
90
+ UCharsetDetector_set_text(VALUE self, VALUE text)
91
+ {
92
+ return rb_iv_set(self, "@text", text);
93
+ return text;
94
+ }
95
+
96
+ /*
97
+ * call-seq:
98
+ * declared_encoding
99
+ *
100
+ * Get the declared encoding for charset detection.
101
+ */
102
+ static VALUE
103
+ UCharsetDetector_get_declared_encoding(VALUE self)
104
+ {
105
+ return rb_iv_get(self, "@declared_encoding");
106
+ }
107
+
108
+ /*
109
+ * call-seq:
110
+ * declared_encoding=
111
+ *
112
+ * Set the declared encoding for charset detection.
113
+ * The declared encoding of an input text is an encoding obtained
114
+ * by the user from an http header or xml declaration or similar source that
115
+ * can be provided as an additional hint to the charset detector.
116
+ */
117
+ static VALUE
118
+ UCharsetDetector_set_declared_encoding(VALUE self, VALUE declared_encoding)
119
+ {
120
+ return rb_iv_set(self, "@declared_encoding", declared_encoding);
121
+ return declared_encoding;
122
+ }
123
+
124
+ static void
125
+ set_text(VALUE self, VALUE text)
126
+ {
127
+ if (!NIL_P(text)) {
128
+ text = StringValue(text);
129
+
130
+ UErrorCode status = U_ZERO_ERROR;
131
+ UCharsetDetector *detector;
132
+ Data_Get_Struct(self, UCharsetDetector, detector);
133
+
134
+ ucsdet_setText(detector, StringValuePtr(text), RSTRING_LEN(text), &status);
135
+ assure(status);
136
+
137
+ UCharsetDetector_set_text(self, text);
138
+ }
139
+ }
140
+
141
+ static void
142
+ set_declared_encoding(VALUE self, VALUE declared_encoding)
143
+ {
144
+ if (!NIL_P(declared_encoding)){
145
+ declared_encoding = StringValue(declared_encoding);
146
+
147
+ UErrorCode status = U_ZERO_ERROR;
148
+ UCharsetDetector *detector;
149
+ Data_Get_Struct(self, UCharsetDetector, detector);
150
+
151
+ ucsdet_setDeclaredEncoding(detector, StringValuePtr(declared_encoding), RSTRING_LEN(declared_encoding), &status);
152
+ assure(status);
153
+
154
+ UCharsetDetector_set_declared_encoding(self, declared_encoding);
155
+ }
156
+ }
157
+
158
+ /*
159
+ * call-seq:
160
+ * new(text=nil, declared_encoding=nil)
161
+ *
162
+ * Create a new charset detector. Optionally set input text and declared encoding.
163
+ */
164
+ static VALUE
165
+ UCharsetDetector_initialize(int argc, VALUE *argv, VALUE self)
166
+ {
167
+ VALUE text;
168
+ VALUE declared_encoding;
169
+
170
+ rb_scan_args(argc, argv, "02", &text, &declared_encoding);
171
+ if (NIL_P(text))
172
+ UCharsetDetector_set_text(self, Qnil);
173
+ else
174
+ set_text(self, text);
175
+
176
+ if (NIL_P(declared_encoding))
177
+ UCharsetDetector_set_declared_encoding(self, Qnil);
178
+ else
179
+ set_declared_encoding(self, declared_encoding);
180
+
181
+ return self;
182
+ }
183
+
184
+ /*
185
+ * call-seq:
186
+ * detect(text=nil, declared_encoding=nil)
187
+ *
188
+ * Return the charset that best matches the supplied input data.
189
+ *
190
+ * Note though, that because the detection
191
+ * only looks at the start of the input data,
192
+ * there is a possibility that the returned charset will fail to handle
193
+ * the full set of input data.
194
+ *
195
+ * The function will fail if
196
+ * * no charset appears to match the data
197
+ * * no input text has been provided (with +text+ or set with #text= )
198
+ */
199
+ static VALUE
200
+ UCharsetDetector_detect(int argc, VALUE *argv, VALUE self)
201
+ {
202
+ VALUE text;
203
+ VALUE declared_encoding;
204
+
205
+ rb_scan_args(argc, argv, "02", &text, &declared_encoding);
206
+ set_text(self, text);
207
+ set_declared_encoding(self, declared_encoding);
208
+
209
+ UErrorCode status = U_ZERO_ERROR;
210
+ UCharsetDetector *detector;
211
+ Data_Get_Struct(self, UCharsetDetector, detector);
212
+
213
+ const UCharsetMatch *match = ucsdet_detect(detector, &status);
214
+ assure(status);
215
+
216
+ const char *encoding_name = ucsdet_getName(match, &status);
217
+ assure(status);
218
+
219
+ int32_t encoding_confidence = ucsdet_getConfidence(match, &status);
220
+ assure(status);
221
+
222
+ const char *encoding_language = ucsdet_getLanguage(match, &status);
223
+ assure(status);
224
+
225
+ VALUE hash = rb_hash_new();
226
+ rb_hash_aset(hash, ID2SYM(rb_intern("encoding")), rb_str_new2(encoding_name));
227
+ rb_hash_aset(hash, ID2SYM(rb_intern("confidence")), INT2NUM(encoding_confidence));
228
+ rb_hash_aset(hash, ID2SYM(rb_intern("language")), rb_str_new2(encoding_language));
229
+
230
+ return hash;
231
+ }
232
+
233
+ /*
234
+ * call-seq:
235
+ * detect_all(text=nil, declared_encoding=nil)
236
+ *
237
+ * Find all charset matches that appear to be consistent with the input,
238
+ * returning an array of results. The results are ordered with the
239
+ * best quality match first.
240
+ *
241
+ * Because the detection only looks at a limited amount of the
242
+ * input byte data, some of the returned charsets may fail to handle
243
+ * the all of input data.
244
+ *
245
+ * Return an error if
246
+ * * no charset appears to match the data
247
+ * * no input text has been provided (with +text+ or set with #text= )
248
+ */
249
+ static VALUE
250
+ UCharsetDetector_detect_all(int argc, VALUE *argv, VALUE self)
251
+ {
252
+ VALUE text;
253
+ VALUE declared_encoding;
254
+
255
+ rb_scan_args(argc, argv, "02", &text, &declared_encoding);
256
+ set_text(self, text);
257
+ set_declared_encoding(self, declared_encoding);
258
+
259
+ UCharsetDetector *detector;
260
+ Data_Get_Struct(self, UCharsetDetector, detector);
261
+ UErrorCode status = U_ZERO_ERROR;
262
+ int32_t matches_found;
263
+
264
+ const UCharsetMatch **matches = ucsdet_detectAll(detector, &matches_found, &status);
265
+ assure(status);
266
+
267
+ VALUE ary = rb_ary_new();
268
+ int i = 0;
269
+
270
+ for (i = 0; i < matches_found; i++) {
271
+ const char *encoding_name = ucsdet_getName(matches[i], &status);
272
+ assure(status);
273
+
274
+ int32_t encoding_confidence = ucsdet_getConfidence(matches[i], &status);
275
+ assure(status);
276
+
277
+ const char *encoding_language = ucsdet_getLanguage(matches[i], &status);
278
+ assure(status);
279
+
280
+ VALUE hash = rb_hash_new();
281
+ rb_hash_aset(hash, ID2SYM(rb_intern("encoding")), rb_str_new2(encoding_name));
282
+ rb_hash_aset(hash, ID2SYM(rb_intern("confidence")), INT2NUM(encoding_confidence));
283
+ rb_hash_aset(hash, ID2SYM(rb_intern("language")), rb_str_new2(encoding_language));
284
+
285
+ rb_ary_push(ary, hash);
286
+ }
287
+
288
+ return ary;
289
+ }
290
+
291
+ /*
292
+ * call-seq:
293
+ * detectable_charsets
294
+ *
295
+ * Get array of names of all detectable charsets that are known to the
296
+ * charset detection service.
297
+ */
298
+ static VALUE
299
+ UCharsetDetector_get_detectable_charsets(VALUE self)
300
+ {
301
+ UCharsetDetector *detector;
302
+ Data_Get_Struct(self, UCharsetDetector, detector);
303
+ UErrorCode status = U_ZERO_ERROR;
304
+
305
+ UEnumeration *charsets = ucsdet_getAllDetectableCharsets(detector, &status);
306
+ assure(status);
307
+
308
+ VALUE ary = rb_ary_new();
309
+ int32_t result_length;
310
+ const char *charset_name;
311
+
312
+ while (charset_name = uenum_next(charsets, &result_length, &status)) {
313
+ assure(status);
314
+ rb_ary_push(ary, rb_str_new2(charset_name));
315
+ }
316
+ uenum_close(charsets);
317
+
318
+ return ary;
319
+ }
320
+
321
+ void
322
+ Init_uchardet()
323
+ {
324
+ VALUE mICU = rb_define_module("ICU");
325
+
326
+ cUChardetError = rb_define_class_under(mICU, "Error", rb_eStandardError);
327
+
328
+ cUCharsetDetector = rb_define_class_under(mICU, "UCharsetDetector", rb_cObject);
329
+ rb_define_alloc_func(cUCharsetDetector, UCharsetDetector_alloc);
330
+ rb_define_method(cUCharsetDetector, "initialize", UCharsetDetector_initialize, -1);
331
+ rb_define_method(cUCharsetDetector, "input_filtered?", UCharsetDetector_get_input_filtered, 0);
332
+ rb_define_method(cUCharsetDetector, "input_filtered=", UCharsetDetector_set_input_filtered, 1);
333
+ rb_define_method(cUCharsetDetector, "text", UCharsetDetector_get_text, 0);
334
+ rb_define_method(cUCharsetDetector, "text=", UCharsetDetector_set_text, 1);
335
+ rb_define_method(cUCharsetDetector, "declared_encoding", UCharsetDetector_get_declared_encoding, 0);
336
+ rb_define_method(cUCharsetDetector, "declared_encoding=", UCharsetDetector_set_declared_encoding, 1);
337
+ rb_define_method(cUCharsetDetector, "detect", UCharsetDetector_detect, -1);
338
+ rb_define_method(cUCharsetDetector, "detect_all", UCharsetDetector_detect_all, -1);
339
+ rb_define_method(cUCharsetDetector, "detectable_charsets", UCharsetDetector_get_detectable_charsets, 0);
340
+ }
@@ -0,0 +1,37 @@
1
# Put this directory on the load path unless it is already there, either as
# given or in expanded form.
$:.unshift(File.dirname(__FILE__)) unless
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))

module Uchardet
  VERSION = '0.1.1'
end

begin
  require 'uchardet.so'
rescue LoadError => e
  # Loading stays best-effort so this file can still be required for
  # Uchardet::VERSION when the native extension is unavailable, but the
  # failure is no longer invisible: it is reported when warnings are on (-w).
  warn "uchardet: native extension could not be loaded (#{e.message})" if $VERBOSE
end

module ICU # :main: README
  class UCharsetDetector # :main: README
    ##
    # Shortcut for ICU::UCharsetDetector#detect
    #
    # Creates a fresh detector and forwards all arguments to #detect.
    def self.detect(*args)
      new.detect(*args)
    end

    ##
    # Shortcut for ICU::UCharsetDetector#detect_all
    #
    # Creates a fresh detector and forwards all arguments to #detect_all.
    def self.detect_all(*args)
      new.detect_all(*args)
    end

    ##
    # Shortcut for ICU::UCharsetDetector#detectable_charsets
    #
    # Creates a fresh detector and returns its #detectable_charsets.
    def self.detectable_charsets
      new.detectable_charsets
    end
  end
end
@@ -0,0 +1,80 @@
1
require 'optparse'

module Uchardet
  # Command line front end for ICU::UCharsetDetector.
  class CLI
    # Parse +args+ and run the requested action, writing results to +stdout+.
    #
    # With an empty +args+ the usage text is printed and nothing else happens.
    # Option parse errors and a missing file path are reported on STDERR and
    # terminate the process with exit status 1. --help and --list print their
    # output and exit with status 0. +args+ is modified in place by option
    # parsing. Returns the OptionParser instance.
    def self.execute(stdout, args=[])
      @stdout = stdout
      @options = {
        :input_filtered    => false,
        :declared_encoding => nil,
        :detect_all        => false,
        :path              => nil
      }

      parser = OptionParser.new do |opts|
        opts.banner = "Usage: #{File.basename($0)} [options] file\n"

        opts.on("-l", "--list",
          "Display list of detectable character sets."
        ) { self.list; exit }
        opts.on("-s", "--strip",
          "Strip HTML or XML markup before detection."
        ) { @options[:input_filtered] = true }
        # The ENCODING placeholder makes the switch take a value; without it
        # OptionParser passes +true+ to the block instead of the encoding name.
        opts.on("-e", "--encoding ENCODING",
          "Hint the charset detector about possible encoding."
        ) { |arg| @options[:declared_encoding] = arg }
        opts.on("-a", "--all",
          "Show all matching encodings."
        ) { @options[:detect_all] = true }
        opts.on("-h", "--help",
          "Show this help message."
        ) { @stdout.puts opts; exit }
      end

      if args.empty?
        @stdout.puts parser
        return parser
      end

      begin
        parser.parse!(args)
      rescue OptionParser::ParseError => ex
        STDERR.puts "ERROR: #{ex}. See #{File.basename($0)} --help"
        exit 1
      end

      # Whatever is left after option parsing is treated as the file to examine.
      @options[:path] = args.last
      if @options[:path].nil? || @options[:path].empty?
        @stdout.puts parser
        STDERR.puts "ERROR: please specify a file path."
        exit 1
      end

      self.detect
      parser
    end

    # Print the detectable charset names, de-duplicated and sorted, one per line.
    def self.list
      ICU::UCharsetDetector.detectable_charsets.uniq.sort.each { |name| @stdout.puts name }
    end

    # Detect the encoding of the file at @options[:path] and print one
    # "NAME (confidence N%)" line per match. Errors raised while reading or
    # detecting are reported on STDERR instead of propagating.
    def self.detect
      detector = ICU::UCharsetDetector.new
      detector.input_filtered = @options[:input_filtered]

      # Read raw bytes: detection should see the file exactly as stored.
      source = File.open(@options[:path], 'rb') { |file| file.read }

      # The hint is passed with the call so that it actually reaches the detector.
      hint = @options[:declared_encoding]
      matches = if @options[:detect_all]
        detector.detect_all(source, hint)
      else
        [detector.detect(source, hint)]
      end

      matches.each do |match|
        @stdout.puts "#{match[:encoding]} (confidence #{match[:confidence]}%)"
      end
    rescue StandardError => ex
      # StandardError only: interrupts and exit requests must not be swallowed.
      STDERR.puts "ERROR: #{ex}"
    end
  end
end
@@ -0,0 +1,10 @@
1
#!/usr/bin/env ruby
# File: script/console
#
# Starts irb with tab completion and this gem's library preloaded.

# Windows builds of Ruby ship irb as a batch file.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

libs = " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
# The library entry point is lib/uchardet.rb.
libs << " -r #{File.dirname(__FILE__) + '/../lib/uchardet.rb'}"
puts "Loading uchardet gem"
exec "#{irb} #{libs} --simple-prompt"
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
+
4
+ begin
5
+ require 'rubigen'
6
+ rescue LoadError
7
+ require 'rubygems'
8
+ require 'rubigen'
9
+ end
10
+ require 'rubigen/scripts/destroy'
11
+
12
+ ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
+ RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
14
+ RubiGen::Scripts::Destroy.new.run(ARGV)
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
+
4
+ begin
5
+ require 'rubigen'
6
+ rescue LoadError
7
+ require 'rubygems'
8
+ require 'rubigen'
9
+ end
10
+ require 'rubigen/scripts/generate'
11
+
12
+ ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
+ RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
14
+ RubiGen::Scripts::Generate.new.run(ARGV)
@@ -0,0 +1,13 @@
1
+ namespace :extconf do
2
+ desc "Compiles the Ruby extension"
3
+ task :compile
4
+ end
5
+
6
+ task :compile => "extconf:compile"
7
+
8
+ task :test => :compile
9
+
10
+ BIN = "*.{o,bundle,jar,so,obj,pdb,lib,def,exp}"
11
+ $hoe.clean_globs |= ["ext/**/#{BIN}", "lib/**/#{BIN}", 'ext/**/Makefile']
12
+ $hoe.spec.require_paths = Dir['{lib,ext/*}']
13
+ $hoe.spec.extensions = FileList["ext/**/extconf.rb"].to_a
@@ -0,0 +1,43 @@
1
require 'rbconfig'

# Build tasks for the native extension. The extension name is taken from this
# file's own name, so it lives in ext/<name>.
namespace :extconf do
  extension = File.basename(__FILE__, '.rake')

  ext = "ext/#{extension}"
  # RbConfig replaces the deprecated Config constant.
  ext_so = "#{ext}/#{extension}.#{RbConfig::CONFIG['DLEXT']}"
  ext_files = FileList[
    "#{ext}/*.c",
    "#{ext}/*.h",
    "#{ext}/*.rl",
    "#{ext}/extconf.rb",
    "#{ext}/Makefile",
    # "lib"
  ]

  # Runs the build, then verifies that it produced an object or library file.
  task :compile => extension do
    if Dir.glob("**/#{extension}.{o,so,dll,bundle}").empty?
      STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
      STDERR.puts "Gem actually failed to build. Your system is"
      STDERR.puts "NOT configured properly to build #{extension}."
      STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
      exit(1)
    end
  end

  desc "Builds just the #{extension} extension"
  task extension.to_sym => ["#{ext}/Makefile", ext_so]

  # The Makefile is generated by running extconf.rb inside the extension directory.
  file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
    Dir.chdir(ext) do ruby "extconf.rb" end
  end

  file ext_so => ext_files do
    Dir.chdir(ext) do
      # /mswin/ covers both the 32-bit and 64-bit MSVC platform names.
      sh(RUBY_PLATFORM =~ /mswin/ ? 'nmake' : 'make') do |ok, res|
        if !ok
          # Remove partial build products after a failed make.
          require "fileutils"
          FileUtils.rm Dir.glob('*.{so,o,dll,bundle}')
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ require 'stringio'
2
+ require 'test/unit'
3
+ require File.dirname(__FILE__) + '/../lib/uchardet'
@@ -0,0 +1,22 @@
1
+ # encoding: utf-8
2
+
3
+ require File.dirname(__FILE__) + '/test_helper.rb'
4
+
5
+ class TestUchardet < Test::Unit::TestCase # :nodoc:
6
+
7
+ def test_detect
8
+ detector = ICU::UCharsetDetector.new
9
+ assert_equal(detector.detect(''), ICU::UCharsetDetector.detect(''))
10
+ end
11
+
12
+ def test_detect_all
13
+ detector = ICU::UCharsetDetector.new
14
+ assert_equal(detector.detect_all('∑'), ICU::UCharsetDetector.detect_all('∑'))
15
+ end
16
+
17
+ def test_detectable_charsets
18
+ detector = ICU::UCharsetDetector.new
19
+ assert_equal(detector.detectable_charsets, ICU::UCharsetDetector.detectable_charsets)
20
+ end
21
+
22
+ end
@@ -0,0 +1,14 @@
1
+ require File.join(File.dirname(__FILE__), "test_helper.rb")
2
+ require 'uchardet/cli'
3
+
4
+ class TestUchardetCli < Test::Unit::TestCase
5
+ def setup
6
+ Uchardet::CLI.execute(@stdout_io = StringIO.new, [])
7
+ @stdout_io.rewind
8
+ @stdout = @stdout_io.read
9
+ end
10
+
11
+ def test_print_default_output
12
+ assert_match(/Usage: .* \[options\] file/, @stdout)
13
+ end
14
+ end
@@ -0,0 +1,101 @@
1
+ # encoding: utf-8
2
+
3
+ require "test/unit"
4
+
5
+ $:.unshift File.dirname(__FILE__) + "/../ext/uchardet"
6
+ require "uchardet.so"
7
+
8
+ class TestUchardetExtn < Test::Unit::TestCase # :nodoc:
9
+
10
+ def test_init
11
+ assert_not_nil(ICU::UCharsetDetector)
12
+
13
+ assert_nothing_raised do
14
+ detector = ICU::UCharsetDetector.new
15
+ assert_not_nil(detector)
16
+
17
+ detector = ICU::UCharsetDetector.new nil
18
+ assert_not_nil(detector)
19
+
20
+ detector = ICU::UCharsetDetector.new 'some text'
21
+ assert_not_nil(detector)
22
+ end
23
+
24
+ assert_raise(TypeError) do
25
+ detector = ICU::UCharsetDetector.new 0
26
+ end
27
+
28
+ assert_raise(TypeError) do
29
+ detector = ICU::UCharsetDetector.new Time.now
30
+ end
31
+ end
32
+
33
+ def test_detect
34
+ detector = ICU::UCharsetDetector.new
35
+ assert_raise(ICU::Error) do
36
+ detector.detect
37
+ end
38
+ e = detector.detect '∂∆∂∆∂∆'
39
+ assert(e.is_a? Hash)
40
+ assert(e.has_key? :encoding)
41
+ assert(e.has_key? :confidence)
42
+ assert(e.has_key? :language)
43
+ assert_equal('utf-8', e[:encoding].downcase)
44
+ e = detector.detect '··', 'utf-8'
45
+ assert_equal('utf-8', e[:encoding].downcase)
46
+ e = detector.detect '··', 'Shift_JIS'
47
+ assert_equal('utf-8', e[:encoding].downcase)
48
+ end
49
+
50
+ def test_detect_all
51
+ detector = ICU::UCharsetDetector.new
52
+ assert_raise(ICU::Error) do
53
+ detector.detect_all
54
+ end
55
+ a = detector.detect_all '€‹€‹€'
56
+ assert(a.is_a? Array)
57
+ assert_equal(false, a.empty?)
58
+ assert(a[0].is_a? Hash)
59
+ assert(a[0].has_key? :encoding)
60
+ assert(a[0].has_key? :confidence)
61
+ assert(a[0].has_key? :language)
62
+ end
63
+
64
+ def test_input_filtered_accessor
65
+ detector = ICU::UCharsetDetector.new
66
+ assert_equal(false, detector.input_filtered?)
67
+ detector.input_filtered = true
68
+ assert_equal(true, detector.input_filtered?)
69
+ detector.input_filtered = ''
70
+ assert_equal(true, detector.input_filtered?)
71
+ detector.input_filtered = nil
72
+ assert_equal(false, detector.input_filtered?)
73
+ end
74
+
75
+ def test_text_accessor
76
+ detector = ICU::UCharsetDetector.new
77
+ assert_equal(nil, detector.text)
78
+ detector = ICU::UCharsetDetector.new 'blah'
79
+ assert_equal('blah', detector.text)
80
+ detector.text = 'test'
81
+ assert_equal('test', detector.text)
82
+ detector.detect
83
+ assert_equal('test', detector.text)
84
+ end
85
+
86
+ def test_declared_encoding_accessor
87
+ detector = ICU::UCharsetDetector.new
88
+ assert_equal(nil, detector.declared_encoding)
89
+ detector.declared_encoding = 'iso-8859-15'
90
+ assert_equal('iso-8859-15', detector.declared_encoding)
91
+ detector.detect 'test'
92
+ assert_equal('iso-8859-15', detector.declared_encoding)
93
+ end
94
+
95
+ def test_detectable_charsets
96
+ detector = ICU::UCharsetDetector.new
97
+ assert_not_nil(detector.detectable_charsets)
98
+ assert(detector.detectable_charsets.is_a? Array)
99
+ end
100
+
101
+ end
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: uchardet
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Dmitri Goutnik
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-12-19 00:00:00 +03:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 2.4.0
24
+ version:
25
+ description: Fast character set encoding detection using International Components for Unicode C++ library.
26
+ email:
27
+ - dg@syrec.org
28
+ executables:
29
+ - uchardet
30
+ extensions:
31
+ - ext/uchardet/extconf.rb
32
+ extra_rdoc_files:
33
+ - History.txt
34
+ - Manifest.txt
35
+ - README.rdoc
36
+ files:
37
+ - History.txt
38
+ - Manifest.txt
39
+ - README.rdoc
40
+ - Rakefile
41
+ - bin/uchardet
42
+ - ext/uchardet/extconf.rb
43
+ - ext/uchardet/uchardet.c
44
+ - lib/uchardet.rb
45
+ - lib/uchardet/cli.rb
46
+ - script/console
47
+ - script/destroy
48
+ - script/generate
49
+ - tasks/extconf.rake
50
+ - tasks/extconf/uchardet.rake
51
+ - test/test_helper.rb
52
+ - test/test_uchardet.rb
53
+ - test/test_uchardet_cli.rb
54
+ - test/test_uchardet_extn.rb
55
+ has_rdoc: true
56
+ homepage: http://github.com/invisiblellama/uchardet
57
+ licenses: []
58
+
59
+ post_install_message:
60
+ rdoc_options:
61
+ - --main
62
+ - README.rdoc
63
+ require_paths:
64
+ - lib
65
+ - ext/uchardet
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: "0"
71
+ version:
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: "0"
77
+ version:
78
+ requirements: []
79
+
80
+ rubyforge_project: uchardet
81
+ rubygems_version: 1.3.5
82
+ signing_key:
83
+ specification_version: 3
84
+ summary: Fast character set encoding detection using International Components for Unicode C++ library.
85
+ test_files:
86
+ - test/test_helper.rb
87
+ - test/test_uchardet.rb
88
+ - test/test_uchardet_cli.rb
89
+ - test/test_uchardet_extn.rb