uchardet 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 3f88bdf62e92c58a707c9099d3b00128ba1e944d7ed242cc756401e4d6ccdecc
4
+ data.tar.gz: 3e7ad2051d31269fdffde17c3f247836c8c69e525b6da8c27b0e8b7c58ef9880
5
+ SHA512:
6
+ metadata.gz: e0e698ab4a3eec93dc0cbde2aff53728fb98947266703f34ab29c93ede539b0ef32a4e32589d62b0bff319e485e6c2f7b4dad38d8299d98b97ab32167a4749fc
7
+ data.tar.gz: 4bd4efbb61bf4cad064d5c78b3b8d412790691bdfea58dd062c7e4168ccd37791b56a62803e9604efa03527b84e6bbdcf8607995ef2c9ea913397779636009ce
@@ -0,0 +1,3 @@
1
+ *.so
2
+ /pkg/
3
+ /tmp/
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.5.1
5
+ before_install: gem install bundler -v 1.16.2
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source "https://rubygems.org"
2
+ gemspec
@@ -0,0 +1,25 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ uchardet (0.2.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ minitest (5.11.3)
10
+ rake (10.5.0)
11
+ rake-compiler (1.0.4)
12
+ rake
13
+
14
+ PLATFORMS
15
+ ruby
16
+
17
+ DEPENDENCIES
18
+ bundler (~> 1.16)
19
+ minitest (~> 5.0)
20
+ rake (~> 10.0)
21
+ rake-compiler (~> 1.0)
22
+ uchardet!
23
+
24
+ BUNDLED WITH
25
+ 1.16.2
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2009-2018 Dmitri Goutnik
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,60 @@
1
+ # Uchardet
2
+
3
+ Fast character set encoding detection using the [International Components for Unicode](http://site.icu-project.org/) (ICU) library.
4
+
5
+ * https://rubygems.org/gems/uchardet
6
+ * https://github.com/dmgk/uchardet
7
+ * https://www.rubydoc.info/gems/uchardet/
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ ```ruby
14
+ gem 'uchardet'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install uchardet
24
+
25
+ ## Usage
26
+
27
+ ```ruby
28
+ require 'open-uri'
29
+ require 'uchardet'
30
+
31
+ text = URI.open('https://raw.githubusercontent.com/dmgk/uchardet/master/test/samples/shift_jis.txt').read
32
+ encoding = ICU::UCharsetDetector.detect(text)
33
+ encoding # => {:encoding=>"Shift_JIS", :confidence=>100, :language=>"ja"}
34
+ ```
35
+
36
+ From command line:
37
+
38
+ ```
39
+ $ uchardet
40
+
41
+ Usage: uchardet [options] file
42
+ -l, --list Display list of detectable character sets.
43
+ -s, --strip Strip HTML or XML markup before detection.
44
+ -e, --encoding Hint the charset detector about possible encoding.
45
+ -a, --all Show all matching encodings.
46
+ -h, --help Show this help message.
47
+ -v, --version Show version.
48
+
49
+ $ uchardet `which uchardet`
50
+
51
+ ISO-8859-1 (confidence 25%)
52
+ ```
53
+
54
+ ## Contributing
55
+
56
+ Bug reports and pull requests are welcome on GitHub at https://github.com/dmgk/uchardet
57
+
58
+ ## License
59
+
60
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile CHANGED
@@ -1,23 +1,24 @@
1
- require 'rubygems'
2
- gem 'hoe', '>= 2.1.0'
3
- require 'hoe'
4
- require 'fileutils'
5
- require './lib/uchardet'
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/extensiontask'
3
+ require 'rake/testtask'
6
4
 
7
- Hoe.plugin :newgem
5
+ GEMSPEC = Gem::Specification.load("uchardet.gemspec")
8
6
 
9
- # Generate all the Rake tasks
10
- # Run 'rake -T' to see list of generated tasks (from gem root directory)
11
- $hoe = Hoe.spec 'uchardet' do
12
- self.developer 'Dmitri Goutnik', 'dg@syrec.org'
13
- self.readme_file = 'README.rdoc'
14
- self.extra_rdoc_files = ['README.rdoc']
15
- self.rubyforge_name = self.name
7
+ # Rake::ExtensionTask.new(:uchardet_ext) do |t|
8
+ # t.lib_dir = 'lib/uchardet'
9
+ # end
10
+
11
+ Rake::ExtensionTask.new(:uchardet_ext, GEMSPEC) do |t|
12
+ t.ext_dir = 'ext'
13
+ end
14
+
15
+ Rake::TestTask.new(:test) do |t|
16
+ t.libs << 'test'
17
+ t.libs << 'lib'
18
+ t.test_files = FileList['test/**/*_test.rb']
16
19
  end
17
20
 
18
- require 'newgem/tasks'
19
- Dir['tasks/**/*.rake'].each { |t| load t }
21
+ task build: :compile
22
+ task test: :compile
20
23
 
21
- # TODO - want other tests/tasks run by default? Add them to the list
22
- # remove_task :default
23
- # task :default => [:spec, :features]
24
+ task default: :test
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "uchardet"
5
+ require 'open-uri'
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require "irb"
15
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -1,8 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'rubygems'
4
- require File.expand_path(
5
- File.join(File.dirname(__FILE__), %w[.. lib uchardet]))
6
4
  require "uchardet/cli"
7
5
 
8
6
  Uchardet::CLI.execute(STDOUT, ARGV)
@@ -0,0 +1,12 @@
1
+ require 'mkmf'
2
+
3
+ icu_config = `which icu-config`.strip
4
+ if icu_config.empty?
5
+ abort %q{Could not find ICU libraries and/or development tools. Try installing "icu-devtools" or "icu" package.}
6
+ end
7
+
8
+ $LIBS << ' ' + `#{icu_config} --ldflags-system`.chomp
9
+ $LIBS << ' ' + `#{icu_config} --ldflags-libsonly`.chomp
10
+ $LDFLAGS << ' ' + `#{icu_config} --ldflags-searchpath`.chomp
11
+
12
+ create_makefile('uchardet_ext')
@@ -56,7 +56,7 @@ UCharsetDetector_get_input_filtered(VALUE self)
56
56
  *
57
57
  * Enable filtering of input text. If filtering is enabled,
58
58
  * text within angle brackets ("<" and ">") will be removed
59
- * before detection, which will remove most HTML or xml markup.
59
+ * before detection, which will remove most HTML or XML markup.
60
60
  */
61
61
  static VALUE
62
62
  UCharsetDetector_set_input_filtered(VALUE self, VALUE flag)
@@ -110,7 +110,7 @@ UCharsetDetector_get_declared_encoding(VALUE self)
110
110
  *
111
111
  * Set the declared encoding for charset detection.
112
112
  * The declared encoding of an input text is an encoding obtained
113
- * by the user from an http header or xml declaration or similar source that
113
+ * by the user from an HTTP header or XML declaration or similar source that
114
114
  * can be provided as an additional hint to the charset detector.
115
115
  */
116
116
  static VALUE
@@ -123,12 +123,12 @@ static void
123
123
  set_text(VALUE self, VALUE text)
124
124
  {
125
125
  if (!NIL_P(text)) {
126
- text = StringValue(text);
127
-
128
126
  UErrorCode status = U_ZERO_ERROR;
129
127
  UCharsetDetector *detector;
128
+
130
129
  Data_Get_Struct(self, UCharsetDetector, detector);
131
-
130
+
131
+ text = StringValue(text);
132
132
  ucsdet_setText(detector, StringValuePtr(text), RSTRING_LEN(text), &status);
133
133
  ensure(status);
134
134
 
@@ -140,12 +140,12 @@ static void
140
140
  set_declared_encoding(VALUE self, VALUE declared_encoding)
141
141
  {
142
142
  if (!NIL_P(declared_encoding)){
143
- declared_encoding = StringValue(declared_encoding);
144
-
145
143
  UErrorCode status = U_ZERO_ERROR;
146
144
  UCharsetDetector *detector;
145
+
147
146
  Data_Get_Struct(self, UCharsetDetector, detector);
148
147
 
148
+ declared_encoding = StringValue(declared_encoding);
149
149
  ucsdet_setDeclaredEncoding(detector, StringValuePtr(declared_encoding), RSTRING_LEN(declared_encoding), &status);
150
150
  ensure(status);
151
151
 
@@ -183,7 +183,8 @@ UCharsetDetector_initialize(int argc, VALUE *argv, VALUE self)
183
183
  * call-seq:
184
184
  * detect(text=nil, declared_encoding=nil)
185
185
  *
186
- * Return the charset that best matches the supplied input data.
186
+ * Return the charset that best matches the supplied input data. If no match
187
+ * could be found, this method returns nil.
187
188
  *
188
189
  * Note though, that because the detection
189
190
  * only looks at the start of the input data,
@@ -199,28 +200,32 @@ UCharsetDetector_detect(int argc, VALUE *argv, VALUE self)
199
200
  {
200
201
  VALUE text;
201
202
  VALUE declared_encoding;
203
+ UErrorCode status = U_ZERO_ERROR;
204
+ UCharsetDetector *detector;
205
+ const UCharsetMatch *match = NULL;
206
+ const char *encoding_name = "";
207
+ int32_t encoding_confidence = 0;
208
+ const char *encoding_language = "";
209
+ VALUE hash = rb_hash_new();
202
210
 
203
211
  rb_scan_args(argc, argv, "02", &text, &declared_encoding);
204
212
  set_text(self, text);
205
213
  set_declared_encoding(self, declared_encoding);
206
214
 
207
- UErrorCode status = U_ZERO_ERROR;
208
- UCharsetDetector *detector;
209
215
  Data_Get_Struct(self, UCharsetDetector, detector);
210
-
211
- const UCharsetMatch *match = ucsdet_detect(detector, &status);
212
- ensure(status);
213
-
214
- const char *encoding_name = ucsdet_getName(match, &status);
215
- ensure(status);
216
216
 
217
- int32_t encoding_confidence = ucsdet_getConfidence(match, &status);
217
+ match = ucsdet_detect(detector, &status);
218
218
  ensure(status);
219
-
220
- const char *encoding_language = ucsdet_getLanguage(match, &status);
221
- ensure(status);
222
-
223
- VALUE hash = rb_hash_new();
219
+
220
+ if (match) {
221
+ encoding_name = ucsdet_getName(match, &status);
222
+ ensure(status);
223
+ encoding_confidence = ucsdet_getConfidence(match, &status);
224
+ ensure(status);
225
+ encoding_language = ucsdet_getLanguage(match, &status);
226
+ ensure(status);
227
+ }
228
+
224
229
  rb_hash_aset(hash, ID2SYM(rb_intern("encoding")), rb_str_new2(encoding_name));
225
230
  rb_hash_aset(hash, ID2SYM(rb_intern("confidence")), INT2NUM(encoding_confidence));
226
231
  rb_hash_aset(hash, ID2SYM(rb_intern("language")), rb_str_new2(encoding_language));
@@ -249,37 +254,41 @@ UCharsetDetector_detect_all(int argc, VALUE *argv, VALUE self)
249
254
  {
250
255
  VALUE text;
251
256
  VALUE declared_encoding;
257
+ UCharsetDetector *detector;
258
+ UErrorCode status = U_ZERO_ERROR;
259
+ const UCharsetMatch **matches = NULL;
260
+ int32_t matches_found = 0;
261
+ VALUE ary = rb_ary_new();
262
+ int i;
252
263
 
253
264
  rb_scan_args(argc, argv, "02", &text, &declared_encoding);
254
265
  set_text(self, text);
255
266
  set_declared_encoding(self, declared_encoding);
256
267
 
257
- UCharsetDetector *detector;
258
268
  Data_Get_Struct(self, UCharsetDetector, detector);
259
- UErrorCode status = U_ZERO_ERROR;
260
- int32_t matches_found;
261
269
 
262
- const UCharsetMatch **matches = ucsdet_detectAll(detector, &matches_found, &status);
270
+ matches = ucsdet_detectAll(detector, &matches_found, &status);
263
271
  ensure(status);
264
272
 
265
- VALUE ary = rb_ary_new();
266
- int i = 0;
267
-
268
273
  for (i = 0; i < matches_found; i++) {
269
- const char *encoding_name = ucsdet_getName(matches[i], &status);
270
- ensure(status);
274
+ const char *encoding_name = "";
275
+ int32_t encoding_confidence = 0;
276
+ const char *encoding_language = "";
277
+ VALUE hash = rb_hash_new();
271
278
 
272
- int32_t encoding_confidence = ucsdet_getConfidence(matches[i], &status);
273
- ensure(status);
274
-
275
- const char *encoding_language = ucsdet_getLanguage(matches[i], &status);
276
- ensure(status);
279
+ if (matches[i]) {
280
+ encoding_name = ucsdet_getName(matches[i], &status);
281
+ ensure(status);
282
+ encoding_confidence = ucsdet_getConfidence(matches[i], &status);
283
+ ensure(status);
284
+ encoding_language = ucsdet_getLanguage(matches[i], &status);
285
+ ensure(status);
286
+ }
277
287
 
278
- VALUE hash = rb_hash_new();
279
288
  rb_hash_aset(hash, ID2SYM(rb_intern("encoding")), rb_str_new2(encoding_name));
280
289
  rb_hash_aset(hash, ID2SYM(rb_intern("confidence")), INT2NUM(encoding_confidence));
281
290
  rb_hash_aset(hash, ID2SYM(rb_intern("language")), rb_str_new2(encoding_language));
282
-
291
+
283
292
  rb_ary_push(ary, hash);
284
293
  }
285
294
 
@@ -296,16 +305,17 @@ static VALUE
296
305
  UCharsetDetector_get_detectable_charsets(VALUE self)
297
306
  {
298
307
  UCharsetDetector *detector;
299
- Data_Get_Struct(self, UCharsetDetector, detector);
300
308
  UErrorCode status = U_ZERO_ERROR;
309
+ UEnumeration *charsets = NULL;
310
+ const char *charset_name = "";
311
+ int32_t result_length = 0;
312
+ VALUE ary = rb_ary_new();
313
+
314
+ Data_Get_Struct(self, UCharsetDetector, detector);
301
315
 
302
- UEnumeration *charsets = ucsdet_getAllDetectableCharsets(detector, &status);
316
+ charsets = ucsdet_getAllDetectableCharsets(detector, &status);
303
317
  ensure(status);
304
318
 
305
- VALUE ary = rb_ary_new();
306
- int32_t result_length;
307
- const char *charset_name;
308
-
309
319
  while (charset_name = uenum_next(charsets, &result_length, &status)) {
310
320
  ensure(status);
311
321
  rb_ary_push(ary, rb_str_new2(charset_name));
@@ -318,7 +328,7 @@ UCharsetDetector_get_detectable_charsets(VALUE self)
318
328
  /*
319
329
  */
320
330
  void
321
- Init_uchardet()
331
+ Init_uchardet_ext()
322
332
  {
323
333
  VALUE mICU = rb_define_module("ICU");
324
334
 
@@ -1,35 +1,19 @@
1
- $:.unshift(File.dirname(__FILE__)) unless
2
- $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
-
4
- module Uchardet
5
- VERSION = '0.1.3'
6
- end
7
-
8
- begin
9
- require 'uchardet.so'
10
- rescue LoadError
11
- # uh-oh
12
- end
1
+ require 'uchardet_ext'
2
+ require 'uchardet/version'
13
3
 
14
4
  module ICU # :main: README
15
5
  class UCharsetDetector # :main: README
16
- ##
17
6
  # Shortcut for ICU::UCharsetDetector#detect
18
- #
19
7
  def self.detect(*args)
20
8
  self.new.detect(*args)
21
9
  end
22
10
 
23
- ##
24
11
  # Shortcut for ICU::UCharsetDetector#detect_all
25
- #
26
12
  def self.detect_all(*args)
27
13
  self.new.detect_all(*args)
28
14
  end
29
15
 
30
- ##
31
16
  # Shortcut for ICU::UCharsetDetector#detectable_charsets
32
- #
33
17
  def self.detectable_charsets
34
18
  self.new.detectable_charsets
35
19
  end
@@ -1,18 +1,19 @@
1
1
  require 'optparse'
2
+ require 'uchardet'
2
3
 
3
4
  module Uchardet
4
5
  class CLI
5
6
  def self.execute(stdout, args=[])
6
7
  @stdout = stdout
7
8
  @options = {
8
- :input_filtered => false,
9
- :declared_encoding => nil,
10
- :detect_all => false,
11
- :path => nil
9
+ input_filtered: false,
10
+ declared_encoding: nil,
11
+ detect_all: false,
12
+ path: nil
12
13
  }
13
14
 
14
- parser = OptionParser.new do |opts|
15
- opts.banner = <<-BANNER.gsub(/^\s*/,'')
15
+ OptionParser.new do |opts|
16
+ opts.banner = <<-BANNER.gsub(/^\s*/, '')
16
17
  Usage: #{File.basename($0)} [options] file
17
18
  BANNER
18
19
 
@@ -31,6 +32,9 @@ module Uchardet
31
32
  opts.on("-h", "--help",
32
33
  "Show this help message."
33
34
  ) { @stdout.puts opts; exit }
35
+ opts.on("-v", "--version",
36
+ "Show version."
37
+ ) { @stdout.puts Uchardet::VERSION; exit }
34
38
 
35
39
  if args.empty?
36
40
  @stdout.puts opts
@@ -54,7 +58,7 @@ module Uchardet
54
58
  end
55
59
 
56
60
  def self.list
57
- ICU::UCharsetDetector.detectable_charsets.uniq.sort.each { |name| @stdout.puts name }
61
+ ICU::UCharsetDetector.detectable_charsets.uniq.sort.each {|name| @stdout.puts name}
58
62
  end
59
63
 
60
64
  def self.detect
@@ -0,0 +1,3 @@
1
+ module Uchardet
2
+ VERSION = "0.2.0"
3
+ end
@@ -0,0 +1,27 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require 'uchardet/version'
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'uchardet'
7
+ spec.version = Uchardet::VERSION
8
+ spec.authors = ['Dmitri Goutnik']
9
+ spec.email = ['dg@syrec.org']
10
+
11
+ spec.summary = 'Fast character set encoding detection using International Components for Unicode library.'
12
+ spec.homepage = 'https://github.com/dmgk/uchardet'
13
+ spec.license = 'MIT'
14
+
15
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
16
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
17
+ end
18
+ spec.bindir = 'exe'
19
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
+ spec.require_paths = ['lib']
21
+ spec.extensions = ['ext/extconf.rb']
22
+
23
+ spec.add_development_dependency 'bundler', '~> 1.16'
24
+ spec.add_development_dependency 'rake', '~> 10.0'
25
+ spec.add_development_dependency 'rake-compiler', '~> 1.0'
26
+ spec.add_development_dependency 'minitest', '~> 5.0'
27
+ end
metadata CHANGED
@@ -1,89 +1,119 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: uchardet
3
- version: !ruby/object:Gem::Version
4
- version: 0.1.3
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
5
  platform: ruby
6
- authors:
6
+ authors:
7
7
  - Dmitri Goutnik
8
8
  autorequire:
9
- bindir: bin
9
+ bindir: exe
10
10
  cert_chain: []
11
-
12
- date: 2009-12-20 00:00:00 +03:00
13
- default_executable:
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: hoe
11
+ date: 2018-05-29 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
17
20
  type: :development
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 2.4.0
24
- version:
25
- description: Fast character set encoding detection using International Components for Unicode C++ library.
26
- email:
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '5.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '5.0'
69
+ description:
70
+ email:
27
71
  - dg@syrec.org
28
- executables:
72
+ executables:
29
73
  - uchardet
30
- extensions:
31
- - ext/uchardet/extconf.rb
32
- extra_rdoc_files:
33
- - History.txt
34
- - Manifest.txt
35
- - README.rdoc
36
- files:
37
- - History.txt
38
- - Manifest.txt
39
- - README.rdoc
74
+ extensions:
75
+ - ext/extconf.rb
76
+ extra_rdoc_files: []
77
+ files:
78
+ - ".gitignore"
79
+ - ".travis.yml"
80
+ - Gemfile
81
+ - Gemfile.lock
82
+ - LICENSE.txt
83
+ - README.md
40
84
  - Rakefile
41
- - bin/uchardet
42
- - ext/uchardet/extconf.rb
43
- - ext/uchardet/uchardet.c
85
+ - bin/console
86
+ - bin/setup
87
+ - exe/uchardet
88
+ - ext/extconf.rb
89
+ - ext/uchardet.c
44
90
  - lib/uchardet.rb
45
91
  - lib/uchardet/cli.rb
46
- - script/console
47
- - script/destroy
48
- - script/generate
49
- - tasks/extconf.rake
50
- - tasks/extconf/uchardet.rake
51
- - test/test_helper.rb
52
- - test/test_uchardet.rb
53
- - test/test_uchardet_cli.rb
54
- - test/test_uchardet_extn.rb
55
- has_rdoc: true
56
- homepage: http://rubyforge.org/projects/uchardet/
57
- licenses: []
58
-
92
+ - lib/uchardet/version.rb
93
+ - uchardet.gemspec
94
+ homepage: https://github.com/dmgk/uchardet
95
+ licenses:
96
+ - MIT
97
+ metadata: {}
59
98
  post_install_message:
60
- rdoc_options:
61
- - --main
62
- - README.rdoc
63
- require_paths:
99
+ rdoc_options: []
100
+ require_paths:
64
101
  - lib
65
- - ext/uchardet
66
- required_ruby_version: !ruby/object:Gem::Requirement
67
- requirements:
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
68
104
  - - ">="
69
- - !ruby/object:Gem::Version
70
- version: "0"
71
- version:
72
- required_rubygems_version: !ruby/object:Gem::Requirement
73
- requirements:
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ requirements:
74
109
  - - ">="
75
- - !ruby/object:Gem::Version
76
- version: "0"
77
- version:
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
78
112
  requirements: []
79
-
80
- rubyforge_project: uchardet
81
- rubygems_version: 1.3.5
113
+ rubyforge_project:
114
+ rubygems_version: 2.7.7
82
115
  signing_key:
83
- specification_version: 3
84
- summary: Fast character set encoding detection using International Components for Unicode C++ library.
85
- test_files:
86
- - test/test_helper.rb
87
- - test/test_uchardet.rb
88
- - test/test_uchardet_cli.rb
89
- - test/test_uchardet_extn.rb
116
+ specification_version: 4
117
+ summary: Fast character set encoding detection using International Components for
118
+ Unicode library.
119
+ test_files: []
@@ -1,11 +0,0 @@
1
- === 0.1.1 2009-12-19
2
-
3
- * Initial release
4
-
5
- === 0.1.2 2009-12-20
6
-
7
- * Documentation and code cleanup.
8
-
9
- === 0.1.3 2009-12-20
10
-
11
- * extconf.rb fixes for Debian/Ubuntu builds
@@ -1,18 +0,0 @@
1
- History.txt
2
- Manifest.txt
3
- README.rdoc
4
- Rakefile
5
- bin/uchardet
6
- ext/uchardet/extconf.rb
7
- ext/uchardet/uchardet.c
8
- lib/uchardet.rb
9
- lib/uchardet/cli.rb
10
- script/console
11
- script/destroy
12
- script/generate
13
- tasks/extconf.rake
14
- tasks/extconf/uchardet.rake
15
- test/test_helper.rb
16
- test/test_uchardet.rb
17
- test/test_uchardet_cli.rb
18
- test/test_uchardet_extn.rb
@@ -1,52 +0,0 @@
1
- = uchardet
2
-
3
- * http://rubyforge.org/projects/uchardet/
4
- * http://github.com/invisiblellama/uchardet
5
- * http://uchardet.rubyforge.org/rdoc/
6
-
7
- == DESCRIPTION:
8
-
9
- Fast character set encoding detection using International Components for Unicode C++ library.
10
-
11
- == SYNOPSIS:
12
-
13
- require 'open-uri'
14
- require 'uchardet'
15
-
16
- encoding = ICU::UCharsetDetector.detect open('http://google.jp').read
17
- encoding # => {:language=>"ja", :encoding=>"Shift_JIS", :confidence=>100}
18
-
19
- From command line:
20
-
21
- $ uchardet
22
-
23
- Usage: uchardet [options] file
24
- -l, --list Display list of detectable character sets.
25
- -s, --strip Strip HTML or XML markup before detection.
26
- -e, --encoding Hint the charset detector about possible encoding.
27
- -a, --all Show all matching encodings.
28
- -h, --help Show this help message.
29
-
30
- $ uchardet `which uchardet`
31
-
32
- ISO-8859-1 (confidence 60%)
33
-
34
- == REQUIREMENTS:
35
-
36
- ICU[http://site.icu-project.org/] (International Components for Unicode):
37
-
38
- on Mac OS X:
39
-
40
- sudo port install icu
41
-
42
- on Debian/Ubuntu
43
-
44
- sudo apt-get install libicu-dev
45
-
46
- == INSTALL:
47
-
48
- sudo gem install uchardet
49
-
50
- == LICENSE:
51
-
52
- Copyright (c) 2009 Dmitri Goutnik, released under the MIT license.
@@ -1,12 +0,0 @@
1
- require 'mkmf'
2
-
3
- icu_config = `which icu-config`.strip
4
- if icu_config.empty?
5
- abort "ICU seems to be missing. Try 'port install icu' or 'apt-get install libicu-dev'"
6
- end
7
-
8
- $LIBS << ' ' + `#{icu_config} --ldflags-system`.strip
9
- $LIBS << ' ' + `#{icu_config} --ldflags-libsonly`.strip
10
- $LDFLAGS << ' ' + `#{icu_config} --ldflags-searchpath`.strip
11
-
12
- create_makefile("uchardet")
@@ -1,10 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # File: script/console
3
- irb = RUBY_PLATFORM =~ /(:?mswin|mingw)/ ? 'irb.bat' : 'irb'
4
-
5
- libs = " -r irb/completion"
6
- # Perhaps use a console_lib to store any extra methods I may want available in the console
7
- # libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
8
- libs << " -r #{File.dirname(__FILE__) + '/../lib/chardet-icu.rb'}"
9
- puts "Loading chardet-icu gem"
10
- exec "#{irb} #{libs} --simple-prompt"
@@ -1,14 +0,0 @@
1
- #!/usr/bin/env ruby
2
- APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
-
4
- begin
5
- require 'rubigen'
6
- rescue LoadError
7
- require 'rubygems'
8
- require 'rubigen'
9
- end
10
- require 'rubigen/scripts/destroy'
11
-
12
- ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
- RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
14
- RubiGen::Scripts::Destroy.new.run(ARGV)
@@ -1,14 +0,0 @@
1
- #!/usr/bin/env ruby
2
- APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
-
4
- begin
5
- require 'rubigen'
6
- rescue LoadError
7
- require 'rubygems'
8
- require 'rubigen'
9
- end
10
- require 'rubigen/scripts/generate'
11
-
12
- ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
- RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
14
- RubiGen::Scripts::Generate.new.run(ARGV)
@@ -1,13 +0,0 @@
1
- namespace :extconf do
2
- desc "Compiles the Ruby extension"
3
- task :compile
4
- end
5
-
6
- task :compile => "extconf:compile"
7
-
8
- task :test => :compile
9
-
10
- BIN = "*.{o,bundle,jar,so,obj,pdb,lib,def,exp}"
11
- $hoe.clean_globs |= ["ext/**/#{BIN}", "lib/**/#{BIN}", 'ext/**/Makefile']
12
- $hoe.spec.require_paths = Dir['{lib,ext/*}']
13
- $hoe.spec.extensions = FileList["ext/**/extconf.rb"].to_a
@@ -1,43 +0,0 @@
1
- namespace :extconf do
2
- extension = File.basename(__FILE__, '.rake')
3
-
4
- ext = "ext/#{extension}"
5
- ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"
6
- ext_files = FileList[
7
- "#{ext}/*.c",
8
- "#{ext}/*.h",
9
- "#{ext}/*.rl",
10
- "#{ext}/extconf.rb",
11
- "#{ext}/Makefile",
12
- # "lib"
13
- ]
14
-
15
-
16
- task :compile => extension do
17
- if Dir.glob("**/#{extension}.{o,so,dll}").length == 0
18
- STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
19
- STDERR.puts "Gem actually failed to build. Your system is"
20
- STDERR.puts "NOT configured properly to build #{GEM_NAME}."
21
- STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
22
- exit(1)
23
- end
24
- end
25
-
26
- desc "Builds just the #{extension} extension"
27
- task extension.to_sym => ["#{ext}/Makefile", ext_so ]
28
-
29
- file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
30
- Dir.chdir(ext) do ruby "extconf.rb" end
31
- end
32
-
33
- file ext_so => ext_files do
34
- Dir.chdir(ext) do
35
- sh(RUBY_PLATFORM =~ /win32/ ? 'nmake' : 'make') do |ok, res|
36
- if !ok
37
- require "fileutils"
38
- FileUtils.rm Dir.glob('*.{so,o,dll,bundle}')
39
- end
40
- end
41
- end
42
- end
43
- end
@@ -1,3 +0,0 @@
1
- require 'stringio'
2
- require 'test/unit'
3
- require File.dirname(__FILE__) + '/../lib/uchardet'
@@ -1,22 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require File.dirname(__FILE__) + '/test_helper.rb'
4
-
5
- class TestUchardet < Test::Unit::TestCase # :nodoc:
6
-
7
- def test_detect
8
- detector = ICU::UCharsetDetector.new
9
- assert_equal(detector.detect(''), ICU::UCharsetDetector.detect(''))
10
- end
11
-
12
- def test_detect_all
13
- detector = ICU::UCharsetDetector.new
14
- assert_equal(detector.detect_all('∑'), ICU::UCharsetDetector.detect_all('∑'))
15
- end
16
-
17
- def test_detectable_charsets
18
- detector = ICU::UCharsetDetector.new
19
- assert_equal(detector.detectable_charsets, ICU::UCharsetDetector.detectable_charsets)
20
- end
21
-
22
- end
@@ -1,14 +0,0 @@
1
- require File.join(File.dirname(__FILE__), "test_helper.rb")
2
- require 'uchardet/cli'
3
-
4
- class TestUchardetCli < Test::Unit::TestCase
5
- def setup
6
- Uchardet::CLI.execute(@stdout_io = StringIO.new, [])
7
- @stdout_io.rewind
8
- @stdout = @stdout_io.read
9
- end
10
-
11
- def test_print_default_output
12
- assert_match(/Usage: .* \[options\] file/, @stdout)
13
- end
14
- end
@@ -1,101 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require "test/unit"
4
-
5
- $:.unshift File.dirname(__FILE__) + "/../ext/uchardet"
6
- require "uchardet.so"
7
-
8
- class TestUchardetExtn < Test::Unit::TestCase # :nodoc:
9
-
10
- def test_init
11
- assert_not_nil(ICU::UCharsetDetector)
12
-
13
- assert_nothing_raised do
14
- detector = ICU::UCharsetDetector.new
15
- assert_not_nil(detector)
16
-
17
- detector = ICU::UCharsetDetector.new nil
18
- assert_not_nil(detector)
19
-
20
- detector = ICU::UCharsetDetector.new 'some text'
21
- assert_not_nil(detector)
22
- end
23
-
24
- assert_raise(TypeError) do
25
- detector = ICU::UCharsetDetector.new 0
26
- end
27
-
28
- assert_raise(TypeError) do
29
- detector = ICU::UCharsetDetector.new Time.now
30
- end
31
- end
32
-
33
- def test_detect
34
- detector = ICU::UCharsetDetector.new
35
- assert_raise(ICU::Error) do
36
- detector.detect
37
- end
38
- e = detector.detect '∂∆∂∆∂∆'
39
- assert(e.is_a? Hash)
40
- assert(e.has_key? :encoding)
41
- assert(e.has_key? :confidence)
42
- assert(e.has_key? :language)
43
- assert_equal('utf-8', e[:encoding].downcase)
44
- e = detector.detect '··', 'utf-8'
45
- assert_equal('utf-8', e[:encoding].downcase)
46
- e = detector.detect '··', 'Shift_JIS'
47
- assert_equal('utf-8', e[:encoding].downcase)
48
- end
49
-
50
- def test_detect_all
51
- detector = ICU::UCharsetDetector.new
52
- assert_raise(ICU::Error) do
53
- detector.detect_all
54
- end
55
- a = detector.detect_all '€‹€‹€'
56
- assert(a.is_a? Array)
57
- assert_equal(false, a.empty?)
58
- assert(a[0].is_a? Hash)
59
- assert(a[0].has_key? :encoding)
60
- assert(a[0].has_key? :confidence)
61
- assert(a[0].has_key? :language)
62
- end
63
-
64
- def test_input_filtered_accessor
65
- detector = ICU::UCharsetDetector.new
66
- assert_equal(false, detector.input_filtered?)
67
- detector.input_filtered = true
68
- assert_equal(true, detector.input_filtered?)
69
- detector.input_filtered = ''
70
- assert_equal(true, detector.input_filtered?)
71
- detector.input_filtered = nil
72
- assert_equal(false, detector.input_filtered?)
73
- end
74
-
75
- def test_text_accessor
76
- detector = ICU::UCharsetDetector.new
77
- assert_equal(nil, detector.text)
78
- detector = ICU::UCharsetDetector.new 'blah'
79
- assert_equal('blah', detector.text)
80
- detector.text = 'test'
81
- assert_equal('test', detector.text)
82
- detector.detect
83
- assert_equal('test', detector.text)
84
- end
85
-
86
- def test_declared_encoding_accessor
87
- detector = ICU::UCharsetDetector.new
88
- assert_equal(nil, detector.declared_encoding)
89
- detector.declared_encoding = 'iso-8859-15'
90
- assert_equal('iso-8859-15', detector.declared_encoding)
91
- detector.detect 'test'
92
- assert_equal('iso-8859-15', detector.declared_encoding)
93
- end
94
-
95
- def test_detectable_charsets
96
- detector = ICU::UCharsetDetector.new
97
- assert_not_nil(detector.detectable_charsets)
98
- assert(detector.detectable_charsets.is_a? Array)
99
- end
100
-
101
- end