uchardet 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ === 0.1.1 2009-12-19
2
+
3
+ * Initial release
@@ -0,0 +1,18 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ bin/uchardet
6
+ ext/uchardet/extconf.rb
7
+ ext/uchardet/uchardet.c
8
+ lib/uchardet.rb
9
+ lib/uchardet/cli.rb
10
+ script/console
11
+ script/destroy
12
+ script/generate
13
+ tasks/extconf.rake
14
+ tasks/extconf/uchardet.rake
15
+ test/test_helper.rb
16
+ test/test_uchardet.rb
17
+ test/test_uchardet_cli.rb
18
+ test/test_uchardet_extn.rb
@@ -0,0 +1,50 @@
1
+ = uchardet
2
+
3
+ * http://github.com/invisiblellama/uchardet
4
+
5
+ == DESCRIPTION:
6
+
7
+ Fast character set encoding detection using the International Components for Unicode (ICU) C++ library.
8
+
9
+ == SYNOPSIS:
10
+
11
+ require 'open-uri'
12
+ require 'uchardet'
13
+
14
+ encoding = ICU::UCharsetDetector.detect open('http://google.jp').read
15
+ encoding # => {:language=>"ja", :encoding=>"Shift_JIS", :confidence=>100}
16
+
17
+ From command line:
18
+
19
+ $ uchardet
20
+
21
+ Usage: uchardet [options] file
22
+ -l, --list Display list of detectable character sets.
23
+ -s, --strip Strip HTML or XML markup before detection.
24
+ -e, --encoding Hint the charset detector about possible encoding.
25
+ -a, --all Show all matching encodings.
26
+ -h, --help Show this help message.
27
+
28
+ $ uchardet `which uchardet`
29
+
30
+ ISO-8859-1 (confidence 60%)
31
+
32
+ == REQUIREMENTS:
33
+
34
+ ICU[http://site.icu-project.org/] (International Components for Unicode):
35
+
36
+ on Mac OS X:
37
+
38
+ sudo port install icu
39
+
40
+ on Debian/Ubuntu:
41
+
42
+ sudo apt-get install libicu-dev
43
+
44
+ == INSTALL:
45
+
46
+ sudo gem install uchardet
47
+
48
+ == LICENSE:
49
+
50
+ Copyright (c) 2009 Dmitri Goutnik, released under the MIT license.
@@ -0,0 +1,23 @@
1
+ require 'rubygems'
2
+ gem 'hoe', '>= 2.1.0'
3
+ require 'hoe'
4
+ require 'fileutils'
5
+ require './lib/uchardet'
6
+
7
Hoe.plugin :newgem

# Describe the gem to Hoe, which in turn generates the Rake tasks.
# Run 'rake -T' from the gem root directory to list them.
$hoe = Hoe.spec 'uchardet' do
  developer 'Dmitri Goutnik', 'dg@syrec.org'
  self.readme_file      = 'README.rdoc'
  self.extra_rdoc_files = ['README.rdoc']
  self.rubyforge_name   = name
end

require 'newgem/tasks'

# Pull in every project-specific task definition found under tasks/.
Dir.glob('tasks/**/*.rake').each do |rake_file|
  load rake_file
end
20
+
21
+ # TODO - want other tests/tasks run by default? Add them to the list
22
+ # remove_task :default
23
+ # task :default => [:spec, :features]
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require File.expand_path(
5
+ File.join(File.dirname(__FILE__), %w[.. lib uchardet]))
6
+ require "uchardet/cli"
7
+
8
+ Uchardet::CLI.execute(STDOUT, ARGV)
@@ -0,0 +1,11 @@
1
+ require 'mkmf'
2
+
3
# Locate the icu-config helper; without it the ICU linker flags cannot be
# queried, so the build is aborted with an installation hint.
icu_config = `which icu-config`.strip
abort "ICU seems to be missing. Try 'port install icu' or 'apt-get install libicu-dev'" if icu_config.empty?

# Ask icu-config for the libraries to link against and where to find them.
icu_libs        = `#{icu_config} --ldflags-system --ldflags-libsonly`.strip
icu_search_path = `#{icu_config} --ldflags-searchpath`.strip

$LIBS    << ' ' << icu_libs
$LDFLAGS << ' ' << icu_search_path

create_makefile("uchardet")
@@ -0,0 +1,340 @@
1
+ #include <ruby.h>
2
+ #include <unicode/ucsdet.h>
3
+
4
+ #ifndef RSTRING_PTR
5
+ # define RSTRING_PTR(str) RSTRING(str)->ptr
6
+ # define RSTRING_LEN(str) RSTRING(str)->len
7
+ #endif
8
+
9
+ static VALUE cUChardetError;
10
+ static VALUE cUCharsetDetector;
11
+
12
+ static void
13
+ assure(UErrorCode status)
14
+ {
15
+ if (U_FAILURE(status)) {
16
+ VALUE ex = rb_exc_new2(cUChardetError, u_errorName(status));
17
+ rb_iv_set(ex, "@errno", INT2FIX(status));
18
+ rb_exc_raise(ex);
19
+ }
20
+ }
21
+
22
+ static void
23
+ UCharsetDetector_free(void *detector)
24
+ {
25
+ ucsdet_close(detector);
26
+ }
27
+
28
+ static VALUE
29
+ UCharsetDetector_alloc(VALUE klass)
30
+ {
31
+ UErrorCode status = U_ZERO_ERROR;
32
+ UCharsetDetector* detector = ucsdet_open(&status);
33
+ assure(status);
34
+
35
+ return Data_Wrap_Struct(klass, NULL, UCharsetDetector_free, detector);
36
+ }
37
+
38
+ /*
39
+ * call-seq:
40
+ * input_filtered
41
+ *
42
+ * Return filtering flag value this charset detector.
43
+ */
44
+ static VALUE
45
+ UCharsetDetector_get_input_filtered(VALUE self)
46
+ {
47
+ UCharsetDetector *detector;
48
+ Data_Get_Struct(self, UCharsetDetector, detector);
49
+
50
+ return ucsdet_isInputFilterEnabled(detector) ? Qtrue : Qfalse;
51
+ }
52
+
53
+ /*
54
+ * call-seq:
55
+ * input_filtered=
56
+ *
57
+ * Enable filtering of input text. If filtering is enabled,
58
+ * text within angle brackets ("<" and ">") will be removed
59
+ * before detection, which will remove most HTML or xml markup.
60
+ */
61
+ static VALUE
62
+ UCharsetDetector_set_input_filtered(VALUE self, VALUE flag)
63
+ {
64
+ UCharsetDetector *detector;
65
+ Data_Get_Struct(self, UCharsetDetector, detector);
66
+
67
+ ucsdet_enableInputFilter(detector, RTEST(flag) ? TRUE : FALSE);
68
+ return self;
69
+ }
70
+
71
+ /*
72
+ * call-seq:
73
+ * text
74
+ *
75
+ * Get input text for this detector.
76
+ */
77
+ static VALUE
78
+ UCharsetDetector_get_text(VALUE self)
79
+ {
80
+ return rb_iv_get(self, "@text");
81
+ }
82
+
83
+ /*
84
+ * call-seq:
85
+ * text=
86
+ *
87
+ * Set input text for this detector.
88
+ */
89
+ static VALUE
90
+ UCharsetDetector_set_text(VALUE self, VALUE text)
91
+ {
92
+ return rb_iv_set(self, "@text", text);
93
+ return text;
94
+ }
95
+
96
+ /*
97
+ * call-seq:
98
+ * declared_encoding
99
+ *
100
+ * Get the declared encoding for charset detection.
101
+ */
102
+ static VALUE
103
+ UCharsetDetector_get_declared_encoding(VALUE self)
104
+ {
105
+ return rb_iv_get(self, "@declared_encoding");
106
+ }
107
+
108
+ /*
109
+ * call-seq:
110
+ * declared_encoding=
111
+ *
112
+ * Set the declared encoding for charset detection.
113
+ * The declared encoding of an input text is an encoding obtained
114
+ * by the user from an http header or xml declaration or similar source that
115
+ * can be provided as an additional hint to the charset detector.
116
+ */
117
+ static VALUE
118
+ UCharsetDetector_set_declared_encoding(VALUE self, VALUE declared_encoding)
119
+ {
120
+ return rb_iv_set(self, "@declared_encoding", declared_encoding);
121
+ return declared_encoding;
122
+ }
123
+
124
+ static void
125
+ set_text(VALUE self, VALUE text)
126
+ {
127
+ if (!NIL_P(text)) {
128
+ text = StringValue(text);
129
+
130
+ UErrorCode status = U_ZERO_ERROR;
131
+ UCharsetDetector *detector;
132
+ Data_Get_Struct(self, UCharsetDetector, detector);
133
+
134
+ ucsdet_setText(detector, StringValuePtr(text), RSTRING_LEN(text), &status);
135
+ assure(status);
136
+
137
+ UCharsetDetector_set_text(self, text);
138
+ }
139
+ }
140
+
141
+ static void
142
+ set_declared_encoding(VALUE self, VALUE declared_encoding)
143
+ {
144
+ if (!NIL_P(declared_encoding)){
145
+ declared_encoding = StringValue(declared_encoding);
146
+
147
+ UErrorCode status = U_ZERO_ERROR;
148
+ UCharsetDetector *detector;
149
+ Data_Get_Struct(self, UCharsetDetector, detector);
150
+
151
+ ucsdet_setDeclaredEncoding(detector, StringValuePtr(declared_encoding), RSTRING_LEN(declared_encoding), &status);
152
+ assure(status);
153
+
154
+ UCharsetDetector_set_declared_encoding(self, declared_encoding);
155
+ }
156
+ }
157
+
158
+ /*
159
+ * call-seq:
160
+ * new(text=nil, declared_encoding=nil)
161
+ *
162
+ * Create a new charset detector. Optionally set input text and declared encoding.
163
+ */
164
+ static VALUE
165
+ UCharsetDetector_initialize(int argc, VALUE *argv, VALUE self)
166
+ {
167
+ VALUE text;
168
+ VALUE declared_encoding;
169
+
170
+ rb_scan_args(argc, argv, "02", &text, &declared_encoding);
171
+ if (NIL_P(text))
172
+ UCharsetDetector_set_text(self, Qnil);
173
+ else
174
+ set_text(self, text);
175
+
176
+ if (NIL_P(declared_encoding))
177
+ UCharsetDetector_set_declared_encoding(self, Qnil);
178
+ else
179
+ set_declared_encoding(self, declared_encoding);
180
+
181
+ return self;
182
+ }
183
+
184
+ /*
185
+ * call-seq:
186
+ * detect(text=nil, declared_encoding=nil)
187
+ *
188
+ * Return the charset that best matches the supplied input data.
189
+ *
190
+ * Note though, that because the detection
191
+ * only looks at the start of the input data,
192
+ * there is a possibility that the returned charset will fail to handle
193
+ * the full set of input data.
194
+ *
195
+ * The function will fail if
196
+ * * no charset appears to match the data
197
+ * * no input text has been provided (with +text+ or set with #text= )
198
+ */
199
+ static VALUE
200
+ UCharsetDetector_detect(int argc, VALUE *argv, VALUE self)
201
+ {
202
+ VALUE text;
203
+ VALUE declared_encoding;
204
+
205
+ rb_scan_args(argc, argv, "02", &text, &declared_encoding);
206
+ set_text(self, text);
207
+ set_declared_encoding(self, declared_encoding);
208
+
209
+ UErrorCode status = U_ZERO_ERROR;
210
+ UCharsetDetector *detector;
211
+ Data_Get_Struct(self, UCharsetDetector, detector);
212
+
213
+ const UCharsetMatch *match = ucsdet_detect(detector, &status);
214
+ assure(status);
215
+
216
+ const char *encoding_name = ucsdet_getName(match, &status);
217
+ assure(status);
218
+
219
+ int32_t encoding_confidence = ucsdet_getConfidence(match, &status);
220
+ assure(status);
221
+
222
+ const char *encoding_language = ucsdet_getLanguage(match, &status);
223
+ assure(status);
224
+
225
+ VALUE hash = rb_hash_new();
226
+ rb_hash_aset(hash, ID2SYM(rb_intern("encoding")), rb_str_new2(encoding_name));
227
+ rb_hash_aset(hash, ID2SYM(rb_intern("confidence")), INT2NUM(encoding_confidence));
228
+ rb_hash_aset(hash, ID2SYM(rb_intern("language")), rb_str_new2(encoding_language));
229
+
230
+ return hash;
231
+ }
232
+
233
+ /*
234
+ * call-seq:
235
+ * detect_all(text=nil, declared_encoding=nil)
236
+ *
237
+ * Find all charset matches that appear to be consistent with the input,
238
+ * returning an array of results. The results are ordered with the
239
+ * best quality match first.
240
+ *
241
+ * Because the detection only looks at a limited amount of the
242
+ * input byte data, some of the returned charsets may fail to handle
243
+ * the all of input data.
244
+ *
245
+ * Return an error if
246
+ * * no charset appears to match the data
247
+ * * no input text has been provided (with +text+ or set with #text= )
248
+ */
249
+ static VALUE
250
+ UCharsetDetector_detect_all(int argc, VALUE *argv, VALUE self)
251
+ {
252
+ VALUE text;
253
+ VALUE declared_encoding;
254
+
255
+ rb_scan_args(argc, argv, "02", &text, &declared_encoding);
256
+ set_text(self, text);
257
+ set_declared_encoding(self, declared_encoding);
258
+
259
+ UCharsetDetector *detector;
260
+ Data_Get_Struct(self, UCharsetDetector, detector);
261
+ UErrorCode status = U_ZERO_ERROR;
262
+ int32_t matches_found;
263
+
264
+ const UCharsetMatch **matches = ucsdet_detectAll(detector, &matches_found, &status);
265
+ assure(status);
266
+
267
+ VALUE ary = rb_ary_new();
268
+ int i = 0;
269
+
270
+ for (i = 0; i < matches_found; i++) {
271
+ const char *encoding_name = ucsdet_getName(matches[i], &status);
272
+ assure(status);
273
+
274
+ int32_t encoding_confidence = ucsdet_getConfidence(matches[i], &status);
275
+ assure(status);
276
+
277
+ const char *encoding_language = ucsdet_getLanguage(matches[i], &status);
278
+ assure(status);
279
+
280
+ VALUE hash = rb_hash_new();
281
+ rb_hash_aset(hash, ID2SYM(rb_intern("encoding")), rb_str_new2(encoding_name));
282
+ rb_hash_aset(hash, ID2SYM(rb_intern("confidence")), INT2NUM(encoding_confidence));
283
+ rb_hash_aset(hash, ID2SYM(rb_intern("language")), rb_str_new2(encoding_language));
284
+
285
+ rb_ary_push(ary, hash);
286
+ }
287
+
288
+ return ary;
289
+ }
290
+
291
+ /*
292
+ * call-seq:
293
+ * detectable_charsets
294
+ *
295
+ * Get array of names of all detectable charsets that are known to the
296
+ * charset detection service.
297
+ */
298
+ static VALUE
299
+ UCharsetDetector_get_detectable_charsets(VALUE self)
300
+ {
301
+ UCharsetDetector *detector;
302
+ Data_Get_Struct(self, UCharsetDetector, detector);
303
+ UErrorCode status = U_ZERO_ERROR;
304
+
305
+ UEnumeration *charsets = ucsdet_getAllDetectableCharsets(detector, &status);
306
+ assure(status);
307
+
308
+ VALUE ary = rb_ary_new();
309
+ int32_t result_length;
310
+ const char *charset_name;
311
+
312
+ while (charset_name = uenum_next(charsets, &result_length, &status)) {
313
+ assure(status);
314
+ rb_ary_push(ary, rb_str_new2(charset_name));
315
+ }
316
+ uenum_close(charsets);
317
+
318
+ return ary;
319
+ }
320
+
321
+ void
322
+ Init_uchardet()
323
+ {
324
+ VALUE mICU = rb_define_module("ICU");
325
+
326
+ cUChardetError = rb_define_class_under(mICU, "Error", rb_eStandardError);
327
+
328
+ cUCharsetDetector = rb_define_class_under(mICU, "UCharsetDetector", rb_cObject);
329
+ rb_define_alloc_func(cUCharsetDetector, UCharsetDetector_alloc);
330
+ rb_define_method(cUCharsetDetector, "initialize", UCharsetDetector_initialize, -1);
331
+ rb_define_method(cUCharsetDetector, "input_filtered?", UCharsetDetector_get_input_filtered, 0);
332
+ rb_define_method(cUCharsetDetector, "input_filtered=", UCharsetDetector_set_input_filtered, 1);
333
+ rb_define_method(cUCharsetDetector, "text", UCharsetDetector_get_text, 0);
334
+ rb_define_method(cUCharsetDetector, "text=", UCharsetDetector_set_text, 1);
335
+ rb_define_method(cUCharsetDetector, "declared_encoding", UCharsetDetector_get_declared_encoding, 0);
336
+ rb_define_method(cUCharsetDetector, "declared_encoding=", UCharsetDetector_set_declared_encoding, 1);
337
+ rb_define_method(cUCharsetDetector, "detect", UCharsetDetector_detect, -1);
338
+ rb_define_method(cUCharsetDetector, "detect_all", UCharsetDetector_detect_all, -1);
339
+ rb_define_method(cUCharsetDetector, "detectable_charsets", UCharsetDetector_get_detectable_charsets, 0);
340
+ }
@@ -0,0 +1,37 @@
1
+ $:.unshift(File.dirname(__FILE__)) unless
2
+ $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
+
4
# Namespace holding the gem's own (non-ICU) constants.
module Uchardet
  # Version of the uchardet gem.
  VERSION = "0.1.1"
end
7
+
8
# Load the compiled extension when it is available. A missing extension is
# tolerated here: the LoadError is swallowed and ICU::UCharsetDetector simply
# lacks its native methods.
# NOTE(review): presumably this lets build tooling load this file before the
# extension has been compiled -- confirm before making the failure loud.
begin
  require 'uchardet.so'
rescue LoadError
  # uh-oh: extension not built/installed; continue without it.
end
13
+
14
module ICU # :main: README
  class UCharsetDetector # :main: README
    class << self
      ##
      # Shortcut for ICU::UCharsetDetector#detect: builds a fresh detector
      # and forwards all arguments to it.
      #
      def detect(*args)
        new.detect(*args)
      end

      ##
      # Shortcut for ICU::UCharsetDetector#detect_all: builds a fresh
      # detector and forwards all arguments to it.
      #
      def detect_all(*args)
        new.detect_all(*args)
      end

      ##
      # Shortcut for ICU::UCharsetDetector#detectable_charsets on a fresh
      # detector.
      #
      def detectable_charsets
        new.detectable_charsets
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
+ require 'optparse'
2
+
3
module Uchardet
  # Command line front end for ICU::UCharsetDetector.
  class CLI
    # Parse +args+ and run the requested action, writing regular output to
    # +stdout+ and diagnostics to STDERR.
    #
    # With no arguments the usage text is printed. Otherwise the last
    # argument left after option parsing is taken as the path of the file
    # to analyse. Invalid options and a missing path terminate the process
    # with exit status 1; --help and --list exit after printing.
    def self.execute(stdout, args=[])
      @stdout = stdout
      @options = {
        :input_filtered    => false,
        :declared_encoding => nil,
        :detect_all        => false,
        :path              => nil
      }

      parser = OptionParser.new do |opts|
        opts.banner = "Usage: #{File.basename($0)} [options] file\n"

        opts.on("-l", "--list",
          "Display list of detectable character sets."
        ) { list; exit }
        opts.on("-s", "--strip",
          "Strip HTML or XML markup before detection."
        ) { @options[:input_filtered] = true }
        # The ENCODING placeholder makes the switch take a mandatory
        # argument; without it the block would only ever receive +true+.
        opts.on("-e", "--encoding ENCODING",
          "Hint the charset detector about possible encoding."
        ) { |arg| @options[:declared_encoding] = arg }
        opts.on("-a", "--all",
          "Show all matching encodings."
        ) { @options[:detect_all] = true }
        opts.on("-h", "--help",
          "Show this help message."
        ) { @stdout.puts opts; exit }
      end

      if args.empty?
        @stdout.puts parser
        return
      end

      begin
        parser.parse!(args)
      rescue OptionParser::ParseError => ex
        STDERR.puts "ERROR: #{ex}. See #{File.basename($0)} --help"
        exit 1
      end

      @options[:path] = args.last
      if @options[:path].nil? || @options[:path].empty?
        @stdout.puts parser
        STDERR.puts "ERROR: please specify a file path."
        exit 1
      end

      detect
    end

    # Print the sorted, de-duplicated names of all detectable charsets.
    def self.list
      ICU::UCharsetDetector.detectable_charsets.uniq.sort.each { |name| @stdout.puts name }
    end

    # Run detection on the file named by the parsed options and print one
    # "<encoding> (confidence <n>%)" line per match. Failures (unreadable
    # file, ICU errors) are reported on STDERR and end the process with
    # exit status 1.
    def self.detect
      detector = ICU::UCharsetDetector.new
      detector.input_filtered = @options[:input_filtered]

      # Read raw bytes; File.open (unlike IO.read) never treats a leading
      # "|" in the path as a command to run.
      source = File.open(@options[:path], 'rb') { |file| file.read }

      # The hint is passed along with the text so that it reaches the
      # detector together with the data it applies to.
      hint = @options[:declared_encoding]
      matches = if @options[:detect_all]
        detector.detect_all(source, hint)
      else
        [detector.detect(source, hint)]
      end

      matches.each do |match|
        @stdout.puts "#{match[:encoding]} (confidence #{match[:confidence]}%)"
      end
    rescue StandardError => ex
      STDERR.puts "ERROR: #{ex}"
      exit 1
    end
  end
end
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+ # File: script/console
3
# Pick the irb launcher for the current platform.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

libs = " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
# Preload this gem's entry point (lib/uchardet.rb) into the session.
libs << " -r #{File.dirname(__FILE__) + '/../lib/uchardet.rb'}"
puts "Loading uchardet gem"
exec "#{irb} #{libs} --simple-prompt"
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
# Root directory of the project (the parent of script/).
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Load RubiGen, falling back to RubyGems when it is not on the load path.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# Drop a leading help switch before handing the arguments to RubiGen.
ARGV.shift if %w[--help -h].include?(ARGV.first)
RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Scripts::Destroy.new.run(ARGV)
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
# Root directory of the project (the parent of script/).
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Load RubiGen, falling back to RubyGems when it is not on the load path.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# Drop a leading help switch before handing the arguments to RubiGen.
ARGV.shift if %w[--help -h].include?(ARGV.first)
RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Scripts::Generate.new.run(ARGV)
@@ -0,0 +1,13 @@
1
# Glob fragment matching the build artefacts produced for an extension.
BIN = "*.{o,bundle,jar,so,obj,pdb,lib,def,exp}"

namespace :extconf do
  desc "Compiles the Ruby extension"
  task :compile
end

# Top-level aliases: compiling delegates to the namespaced task, and the
# tests depend on a compiled extension.
task :compile => "extconf:compile"
task :test => :compile

# Teach Hoe about the extension: what to clean, where to require from and
# which extconf scripts to run at install time.
$hoe.clean_globs |= %W[ext/**/#{BIN} lib/**/#{BIN} ext/**/Makefile]
$hoe.spec.require_paths = Dir['{lib,ext/*}']
$hoe.spec.extensions = FileList["ext/**/extconf.rb"].to_a
@@ -0,0 +1,43 @@
1
require 'rbconfig'

namespace :extconf do
  # The extension name is derived from this rake file's own name.
  extension = File.basename(__FILE__, '.rake')

  ext = "ext/#{extension}"
  # RbConfig replaces the Config constant, which modern Ruby no longer defines.
  ext_so = "#{ext}/#{extension}.#{RbConfig::CONFIG['DLEXT']}"
  ext_files = FileList[
    "#{ext}/*.c",
    "#{ext}/*.h",
    "#{ext}/*.rl",
    "#{ext}/extconf.rb",
    "#{ext}/Makefile",
    # "lib"
  ]

  # Verify that building the extension actually produced a binary.
  task :compile => extension do
    if Dir.glob("**/#{extension}.{o,so,dll,bundle}").empty?
      STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
      STDERR.puts "Gem actually failed to build.  Your system is"
      # The extension name is reported; no GEM_NAME constant is defined here.
      STDERR.puts "NOT configured properly to build #{extension}."
      STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
      exit(1)
    end
  end

  desc "Builds just the #{extension} extension"
  task extension.to_sym => ["#{ext}/Makefile", ext_so]

  # Generate the Makefile by running extconf.rb inside the extension directory.
  file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
    Dir.chdir(ext) do ruby "extconf.rb" end
  end

  # Build the shared object; on failure remove partial build products.
  file ext_so => ext_files do
    Dir.chdir(ext) do
      sh(RUBY_PLATFORM =~ /win32/ ? 'nmake' : 'make') do |ok, res|
        if !ok
          require "fileutils"
          FileUtils.rm Dir.glob('*.{so,o,dll,bundle}')
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ require 'stringio'
2
+ require 'test/unit'
3
+ require File.dirname(__FILE__) + '/../lib/uchardet'
@@ -0,0 +1,22 @@
1
+ # encoding: utf-8
2
+
3
+ require File.dirname(__FILE__) + '/test_helper.rb'
4
+
5
# Checks that the class-level shortcuts return the same results as the
# corresponding calls on an explicitly created detector.
class TestUchardet < Test::Unit::TestCase # :nodoc:

  def test_detect
    expected = ICU::UCharsetDetector.new.detect('')
    assert_equal(expected, ICU::UCharsetDetector.detect(''))
  end

  def test_detect_all
    expected = ICU::UCharsetDetector.new.detect_all('∑')
    assert_equal(expected, ICU::UCharsetDetector.detect_all('∑'))
  end

  def test_detectable_charsets
    expected = ICU::UCharsetDetector.new.detectable_charsets
    assert_equal(expected, ICU::UCharsetDetector.detectable_charsets)
  end

end
@@ -0,0 +1,14 @@
1
+ require File.join(File.dirname(__FILE__), "test_helper.rb")
2
+ require 'uchardet/cli'
3
+
4
# Exercises the command line front end with an empty argument list.
class TestUchardetCli < Test::Unit::TestCase
  # Run the CLI without arguments and capture everything it printed.
  def setup
    @stdout_io = StringIO.new
    Uchardet::CLI.execute(@stdout_io, [])
    @stdout_io.rewind
    @stdout = @stdout_io.read
  end

  def test_print_default_output
    assert_match(/Usage: .* \[options\] file/, @stdout)
  end
end
@@ -0,0 +1,101 @@
1
+ # encoding: utf-8
2
+
3
+ require "test/unit"
4
+
5
+ $:.unshift File.dirname(__FILE__) + "/../ext/uchardet"
6
+ require "uchardet.so"
7
+
8
# Tests for the native ICU::UCharsetDetector extension.
class TestUchardetExtn < Test::Unit::TestCase # :nodoc:

  # Construction accepts no argument, nil or a String and rejects others.
  def test_init
    assert_not_nil(ICU::UCharsetDetector)

    assert_nothing_raised do
      assert_not_nil(ICU::UCharsetDetector.new)
      assert_not_nil(ICU::UCharsetDetector.new(nil))
      assert_not_nil(ICU::UCharsetDetector.new('some text'))
    end

    [0, Time.now].each do |not_a_string|
      assert_raise(TypeError) do
        ICU::UCharsetDetector.new not_a_string
      end
    end
  end

  def test_detect
    detector = ICU::UCharsetDetector.new
    assert_raise(ICU::Error) { detector.detect }

    match = detector.detect '∂∆∂∆∂∆'
    assert(match.is_a?(Hash))
    [:encoding, :confidence, :language].each do |key|
      assert(match.has_key?(key))
    end
    assert_equal('utf-8', match[:encoding].downcase)

    # The declared encoding is only a hint; UTF-8 is expected either way.
    ['utf-8', 'Shift_JIS'].each do |hint|
      match = detector.detect '··', hint
      assert_equal('utf-8', match[:encoding].downcase)
    end
  end

  def test_detect_all
    detector = ICU::UCharsetDetector.new
    assert_raise(ICU::Error) { detector.detect_all }

    matches = detector.detect_all '€‹€‹€'
    assert(matches.is_a?(Array))
    assert_equal(false, matches.empty?)

    best = matches[0]
    assert(best.is_a?(Hash))
    [:encoding, :confidence, :language].each do |key|
      assert(best.has_key?(key))
    end
  end

  # Any truthy value (even an empty string) switches filtering on.
  def test_input_filtered_accessor
    detector = ICU::UCharsetDetector.new
    assert_equal(false, detector.input_filtered?)

    [[true, true], ['', true], [nil, false]].each do |value, expected|
      detector.input_filtered = value
      assert_equal(expected, detector.input_filtered?)
    end
  end

  def test_text_accessor
    assert_equal(nil, ICU::UCharsetDetector.new.text)

    detector = ICU::UCharsetDetector.new 'blah'
    assert_equal('blah', detector.text)

    detector.text = 'test'
    assert_equal('test', detector.text)

    # Detection must leave the stored text untouched.
    detector.detect
    assert_equal('test', detector.text)
  end

  def test_declared_encoding_accessor
    detector = ICU::UCharsetDetector.new
    assert_equal(nil, detector.declared_encoding)

    detector.declared_encoding = 'iso-8859-15'
    assert_equal('iso-8859-15', detector.declared_encoding)

    # Detection without an explicit hint keeps the stored one.
    detector.detect 'test'
    assert_equal('iso-8859-15', detector.declared_encoding)
  end

  def test_detectable_charsets
    detector = ICU::UCharsetDetector.new
    assert_not_nil(detector.detectable_charsets)
    assert(detector.detectable_charsets.is_a?(Array))
  end

end
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: uchardet
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Dmitri Goutnik
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-12-19 00:00:00 +03:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 2.4.0
24
+ version:
25
+ description: Fast character set encoding detection using International Components for Unicode C++ library.
26
+ email:
27
+ - dg@syrec.org
28
+ executables:
29
+ - uchardet
30
+ extensions:
31
+ - ext/uchardet/extconf.rb
32
+ extra_rdoc_files:
33
+ - History.txt
34
+ - Manifest.txt
35
+ - README.rdoc
36
+ files:
37
+ - History.txt
38
+ - Manifest.txt
39
+ - README.rdoc
40
+ - Rakefile
41
+ - bin/uchardet
42
+ - ext/uchardet/extconf.rb
43
+ - ext/uchardet/uchardet.c
44
+ - lib/uchardet.rb
45
+ - lib/uchardet/cli.rb
46
+ - script/console
47
+ - script/destroy
48
+ - script/generate
49
+ - tasks/extconf.rake
50
+ - tasks/extconf/uchardet.rake
51
+ - test/test_helper.rb
52
+ - test/test_uchardet.rb
53
+ - test/test_uchardet_cli.rb
54
+ - test/test_uchardet_extn.rb
55
+ has_rdoc: true
56
+ homepage: http://github.com/invisiblellama/uchardet
57
+ licenses: []
58
+
59
+ post_install_message:
60
+ rdoc_options:
61
+ - --main
62
+ - README.rdoc
63
+ require_paths:
64
+ - lib
65
+ - ext/uchardet
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: "0"
71
+ version:
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: "0"
77
+ version:
78
+ requirements: []
79
+
80
+ rubyforge_project: uchardet
81
+ rubygems_version: 1.3.5
82
+ signing_key:
83
+ specification_version: 3
84
+ summary: Fast character set encoding detection using International Components for Unicode C++ library.
85
+ test_files:
86
+ - test/test_helper.rb
87
+ - test/test_uchardet.rb
88
+ - test/test_uchardet_cli.rb
89
+ - test/test_uchardet_extn.rb