uchardet 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 3f88bdf62e92c58a707c9099d3b00128ba1e944d7ed242cc756401e4d6ccdecc
4
+ data.tar.gz: 3e7ad2051d31269fdffde17c3f247836c8c69e525b6da8c27b0e8b7c58ef9880
5
+ SHA512:
6
+ metadata.gz: e0e698ab4a3eec93dc0cbde2aff53728fb98947266703f34ab29c93ede539b0ef32a4e32589d62b0bff319e485e6c2f7b4dad38d8299d98b97ab32167a4749fc
7
+ data.tar.gz: 4bd4efbb61bf4cad064d5c78b3b8d412790691bdfea58dd062c7e4168ccd37791b56a62803e9604efa03527b84e6bbdcf8607995ef2c9ea913397779636009ce
@@ -0,0 +1,3 @@
1
+ *.so
2
+ /pkg/
3
+ /tmp/
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.5.1
5
+ before_install: gem install bundler -v 1.16.2
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source "https://rubygems.org"
2
+ gemspec
@@ -0,0 +1,25 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ uchardet (0.2.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ minitest (5.11.3)
10
+ rake (10.5.0)
11
+ rake-compiler (1.0.4)
12
+ rake
13
+
14
+ PLATFORMS
15
+ ruby
16
+
17
+ DEPENDENCIES
18
+ bundler (~> 1.16)
19
+ minitest (~> 5.0)
20
+ rake (~> 10.0)
21
+ rake-compiler (~> 1.0)
22
+ uchardet!
23
+
24
+ BUNDLED WITH
25
+ 1.16.2
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2009-2018 Dmitri Goutnik
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,60 @@
1
+ # Uchardet
2
+
3
+ Fast character set encoding detection using the [International Components for Unicode](http://site.icu-project.org/) (ICU) library.
4
+
5
+ * https://rubygems.org/gems/uchardet
6
+ * https://github.com/dmgk/uchardet
7
+ * https://www.rubydoc.info/gems/uchardet/
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ ```ruby
14
+ gem 'uchardet'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install uchardet
24
+
25
+ ## Usage
26
+
27
+ ```ruby
28
+ require 'open-uri'
29
+ require 'uchardet'
30
+
31
+ text = open('https://raw.githubusercontent.com/dmgk/uchardet/master/test/samples/shift_jis.txt').read
32
+ encoding = ICU::UCharsetDetector.detect(text)
33
+ encoding # => {:encoding=>"Shift_JIS", :confidence=>100, :language=>"ja"}
34
+ ```
35
+
36
+ From command line:
37
+
38
+ ```
39
+ $ uchardet
40
+
41
+ Usage: uchardet [options] file
42
+ -l, --list Display list of detectable character sets.
43
+ -s, --strip Strip HTML or XML markup before detection.
44
+ -e, --encoding Hint the charset detector about possible encoding.
45
+ -a, --all Show all matching encodings.
46
+ -h, --help Show this help message.
47
+ -v, --version Show version.
48
+
49
+ $ uchardet `which uchardet`
50
+
51
+ ISO-8859-1 (confidence 25%)
52
+ ```
53
+
54
+ ## Contributing
55
+
56
+ Bug reports and pull requests are welcome on GitHub at https://github.com/dmgk/uchardet
57
+
58
+ ## License
59
+
60
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile CHANGED
@@ -1,23 +1,24 @@
1
- require 'rubygems'
2
- gem 'hoe', '>= 2.1.0'
3
- require 'hoe'
4
- require 'fileutils'
5
- require './lib/uchardet'
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/extensiontask'
3
+ require 'rake/testtask'
6
4
 
7
- Hoe.plugin :newgem
5
+ GEMSPEC = Gem::Specification.load("uchardet.gemspec")
8
6
 
9
- # Generate all the Rake tasks
10
- # Run 'rake -T' to see list of generated tasks (from gem root directory)
11
- $hoe = Hoe.spec 'uchardet' do
12
- self.developer 'Dmitri Goutnik', 'dg@syrec.org'
13
- self.readme_file = 'README.rdoc'
14
- self.extra_rdoc_files = ['README.rdoc']
15
- self.rubyforge_name = self.name
7
+ # Rake::ExtensionTask.new(:uchardet_ext) do |t|
8
+ # t.lib_dir = 'lib/uchardet'
9
+ # end
10
+
11
+ Rake::ExtensionTask.new(:uchardet_ext, GEMSPEC) do |t|
12
+ t.ext_dir = 'ext'
13
+ end
14
+
15
+ Rake::TestTask.new(:test) do |t|
16
+ t.libs << 'test'
17
+ t.libs << 'lib'
18
+ t.test_files = FileList['test/**/*_test.rb']
16
19
  end
17
20
 
18
- require 'newgem/tasks'
19
- Dir['tasks/**/*.rake'].each { |t| load t }
21
+ task build: :compile
22
+ task test: :compile
20
23
 
21
- # TODO - want other tests/tasks run by default? Add them to the list
22
- # remove_task :default
23
- # task :default => [:spec, :features]
24
+ task default: :test
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "uchardet"
5
+ require 'open-uri'
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require "irb"
15
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -1,8 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'rubygems'
4
- require File.expand_path(
5
- File.join(File.dirname(__FILE__), %w[.. lib uchardet]))
6
4
  require "uchardet/cli"
7
5
 
8
6
  Uchardet::CLI.execute(STDOUT, ARGV)
@@ -0,0 +1,12 @@
1
+ require 'mkmf'
2
+
3
+ icu_config = `which icu-config`.strip
4
+ if icu_config.empty?
5
+ abort %q{Could not find ICU libraries and/or development tools. Try installing "icu-devtools" or "icu" package.}
6
+ end
7
+
8
+ $LIBS << ' ' + `#{icu_config} --ldflags-system`.chomp
9
+ $LIBS << ' ' + `#{icu_config} --ldflags-libsonly`.chomp
10
+ $LDFLAGS << ' ' + `#{icu_config} --ldflags-searchpath`.chomp
11
+
12
+ create_makefile('uchardet_ext')
@@ -56,7 +56,7 @@ UCharsetDetector_get_input_filtered(VALUE self)
56
56
  *
57
57
  * Enable filtering of input text. If filtering is enabled,
58
58
  * text within angle brackets ("<" and ">") will be removed
59
- * before detection, which will remove most HTML or xml markup.
59
+ * before detection, which will remove most HTML or XML markup.
60
60
  */
61
61
  static VALUE
62
62
  UCharsetDetector_set_input_filtered(VALUE self, VALUE flag)
@@ -110,7 +110,7 @@ UCharsetDetector_get_declared_encoding(VALUE self)
110
110
  *
111
111
  * Set the declared encoding for charset detection.
112
112
  * The declared encoding of an input text is an encoding obtained
113
- * by the user from an http header or xml declaration or similar source that
113
+ * by the user from an HTTP header or XML declaration or similar source that
114
114
  * can be provided as an additional hint to the charset detector.
115
115
  */
116
116
  static VALUE
@@ -123,12 +123,12 @@ static void
123
123
  set_text(VALUE self, VALUE text)
124
124
  {
125
125
  if (!NIL_P(text)) {
126
- text = StringValue(text);
127
-
128
126
  UErrorCode status = U_ZERO_ERROR;
129
127
  UCharsetDetector *detector;
128
+
130
129
  Data_Get_Struct(self, UCharsetDetector, detector);
131
-
130
+
131
+ text = StringValue(text);
132
132
  ucsdet_setText(detector, StringValuePtr(text), RSTRING_LEN(text), &status);
133
133
  ensure(status);
134
134
 
@@ -140,12 +140,12 @@ static void
140
140
  set_declared_encoding(VALUE self, VALUE declared_encoding)
141
141
  {
142
142
  if (!NIL_P(declared_encoding)){
143
- declared_encoding = StringValue(declared_encoding);
144
-
145
143
  UErrorCode status = U_ZERO_ERROR;
146
144
  UCharsetDetector *detector;
145
+
147
146
  Data_Get_Struct(self, UCharsetDetector, detector);
148
147
 
148
+ declared_encoding = StringValue(declared_encoding);
149
149
  ucsdet_setDeclaredEncoding(detector, StringValuePtr(declared_encoding), RSTRING_LEN(declared_encoding), &status);
150
150
  ensure(status);
151
151
 
@@ -183,7 +183,8 @@ UCharsetDetector_initialize(int argc, VALUE *argv, VALUE self)
183
183
  * call-seq:
184
184
  * detect(text=nil, declared_encoding=nil)
185
185
  *
186
- * Return the charset that best matches the supplied input data.
186
+ * Return the charset that best matches the supplied input data. If no match
187
+ * could be found, the result has an empty encoding and language and a confidence of 0.
187
188
  *
188
189
  * Note though, that because the detection
189
190
  * only looks at the start of the input data,
@@ -199,28 +200,32 @@ UCharsetDetector_detect(int argc, VALUE *argv, VALUE self)
199
200
  {
200
201
  VALUE text;
201
202
  VALUE declared_encoding;
203
+ UErrorCode status = U_ZERO_ERROR;
204
+ UCharsetDetector *detector;
205
+ const UCharsetMatch *match = NULL;
206
+ const char *encoding_name = "";
207
+ int32_t encoding_confidence = 0;
208
+ const char *encoding_language = "";
209
+ VALUE hash = rb_hash_new();
202
210
 
203
211
  rb_scan_args(argc, argv, "02", &text, &declared_encoding);
204
212
  set_text(self, text);
205
213
  set_declared_encoding(self, declared_encoding);
206
214
 
207
- UErrorCode status = U_ZERO_ERROR;
208
- UCharsetDetector *detector;
209
215
  Data_Get_Struct(self, UCharsetDetector, detector);
210
-
211
- const UCharsetMatch *match = ucsdet_detect(detector, &status);
212
- ensure(status);
213
-
214
- const char *encoding_name = ucsdet_getName(match, &status);
215
- ensure(status);
216
216
 
217
- int32_t encoding_confidence = ucsdet_getConfidence(match, &status);
217
+ match = ucsdet_detect(detector, &status);
218
218
  ensure(status);
219
-
220
- const char *encoding_language = ucsdet_getLanguage(match, &status);
221
- ensure(status);
222
-
223
- VALUE hash = rb_hash_new();
219
+
220
+ if (match) {
221
+ encoding_name = ucsdet_getName(match, &status);
222
+ ensure(status);
223
+ encoding_confidence = ucsdet_getConfidence(match, &status);
224
+ ensure(status);
225
+ encoding_language = ucsdet_getLanguage(match, &status);
226
+ ensure(status);
227
+ }
228
+
224
229
  rb_hash_aset(hash, ID2SYM(rb_intern("encoding")), rb_str_new2(encoding_name));
225
230
  rb_hash_aset(hash, ID2SYM(rb_intern("confidence")), INT2NUM(encoding_confidence));
226
231
  rb_hash_aset(hash, ID2SYM(rb_intern("language")), rb_str_new2(encoding_language));
@@ -249,37 +254,41 @@ UCharsetDetector_detect_all(int argc, VALUE *argv, VALUE self)
249
254
  {
250
255
  VALUE text;
251
256
  VALUE declared_encoding;
257
+ UCharsetDetector *detector;
258
+ UErrorCode status = U_ZERO_ERROR;
259
+ const UCharsetMatch **matches = NULL;
260
+ int32_t matches_found = 0;
261
+ VALUE ary = rb_ary_new();
262
+ int i;
252
263
 
253
264
  rb_scan_args(argc, argv, "02", &text, &declared_encoding);
254
265
  set_text(self, text);
255
266
  set_declared_encoding(self, declared_encoding);
256
267
 
257
- UCharsetDetector *detector;
258
268
  Data_Get_Struct(self, UCharsetDetector, detector);
259
- UErrorCode status = U_ZERO_ERROR;
260
- int32_t matches_found;
261
269
 
262
- const UCharsetMatch **matches = ucsdet_detectAll(detector, &matches_found, &status);
270
+ matches = ucsdet_detectAll(detector, &matches_found, &status);
263
271
  ensure(status);
264
272
 
265
- VALUE ary = rb_ary_new();
266
- int i = 0;
267
-
268
273
  for (i = 0; i < matches_found; i++) {
269
- const char *encoding_name = ucsdet_getName(matches[i], &status);
270
- ensure(status);
274
+ const char *encoding_name = "";
275
+ int32_t encoding_confidence = 0;
276
+ const char *encoding_language = "";
277
+ VALUE hash = rb_hash_new();
271
278
 
272
- int32_t encoding_confidence = ucsdet_getConfidence(matches[i], &status);
273
- ensure(status);
274
-
275
- const char *encoding_language = ucsdet_getLanguage(matches[i], &status);
276
- ensure(status);
279
+ if (matches[i]) {
280
+ encoding_name = ucsdet_getName(matches[i], &status);
281
+ ensure(status);
282
+ encoding_confidence = ucsdet_getConfidence(matches[i], &status);
283
+ ensure(status);
284
+ encoding_language = ucsdet_getLanguage(matches[i], &status);
285
+ ensure(status);
286
+ }
277
287
 
278
- VALUE hash = rb_hash_new();
279
288
  rb_hash_aset(hash, ID2SYM(rb_intern("encoding")), rb_str_new2(encoding_name));
280
289
  rb_hash_aset(hash, ID2SYM(rb_intern("confidence")), INT2NUM(encoding_confidence));
281
290
  rb_hash_aset(hash, ID2SYM(rb_intern("language")), rb_str_new2(encoding_language));
282
-
291
+
283
292
  rb_ary_push(ary, hash);
284
293
  }
285
294
 
@@ -296,16 +305,17 @@ static VALUE
296
305
  UCharsetDetector_get_detectable_charsets(VALUE self)
297
306
  {
298
307
  UCharsetDetector *detector;
299
- Data_Get_Struct(self, UCharsetDetector, detector);
300
308
  UErrorCode status = U_ZERO_ERROR;
309
+ UEnumeration *charsets = NULL;
310
+ const char *charset_name = "";
311
+ int32_t result_length = 0;
312
+ VALUE ary = rb_ary_new();
313
+
314
+ Data_Get_Struct(self, UCharsetDetector, detector);
301
315
 
302
- UEnumeration *charsets = ucsdet_getAllDetectableCharsets(detector, &status);
316
+ charsets = ucsdet_getAllDetectableCharsets(detector, &status);
303
317
  ensure(status);
304
318
 
305
- VALUE ary = rb_ary_new();
306
- int32_t result_length;
307
- const char *charset_name;
308
-
309
319
  while (charset_name = uenum_next(charsets, &result_length, &status)) {
310
320
  ensure(status);
311
321
  rb_ary_push(ary, rb_str_new2(charset_name));
@@ -318,7 +328,7 @@ UCharsetDetector_get_detectable_charsets(VALUE self)
318
328
  /*
319
329
  */
320
330
  void
321
- Init_uchardet()
331
+ Init_uchardet_ext()
322
332
  {
323
333
  VALUE mICU = rb_define_module("ICU");
324
334
 
@@ -1,35 +1,19 @@
1
- $:.unshift(File.dirname(__FILE__)) unless
2
- $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
-
4
- module Uchardet
5
- VERSION = '0.1.3'
6
- end
7
-
8
- begin
9
- require 'uchardet.so'
10
- rescue LoadError
11
- # uh-oh
12
- end
1
+ require 'uchardet_ext'
2
+ require 'uchardet/version'
13
3
 
14
4
  module ICU # :main: README
15
5
  class UCharsetDetector # :main: README
16
- ##
17
6
  # Shortcut for ICU::UCharsetDetector#detect
18
- #
19
7
  def self.detect(*args)
20
8
  self.new.detect(*args)
21
9
  end
22
10
 
23
- ##
24
11
  # Shortcut for ICU::UCharsetDetector#detect_all
25
- #
26
12
  def self.detect_all(*args)
27
13
  self.new.detect_all(*args)
28
14
  end
29
15
 
30
- ##
31
16
  # Shortcut for ICU::UCharsetDetector#detectable_charsets
32
- #
33
17
  def self.detectable_charsets
34
18
  self.new.detectable_charsets
35
19
  end
@@ -1,18 +1,19 @@
1
1
  require 'optparse'
2
+ require 'uchardet'
2
3
 
3
4
  module Uchardet
4
5
  class CLI
5
6
  def self.execute(stdout, args=[])
6
7
  @stdout = stdout
7
8
  @options = {
8
- :input_filtered => false,
9
- :declared_encoding => nil,
10
- :detect_all => false,
11
- :path => nil
9
+ input_filtered: false,
10
+ declared_encoding: nil,
11
+ detect_all: false,
12
+ path: nil
12
13
  }
13
14
 
14
- parser = OptionParser.new do |opts|
15
- opts.banner = <<-BANNER.gsub(/^\s*/,'')
15
+ OptionParser.new do |opts|
16
+ opts.banner = <<-BANNER.gsub(/^\s*/, '')
16
17
  Usage: #{File.basename($0)} [options] file
17
18
  BANNER
18
19
 
@@ -31,6 +32,9 @@ module Uchardet
31
32
  opts.on("-h", "--help",
32
33
  "Show this help message."
33
34
  ) { @stdout.puts opts; exit }
35
+ opts.on("-v", "--version",
36
+ "Show version."
37
+ ) { @stdout.puts Uchardet::VERSION; exit }
34
38
 
35
39
  if args.empty?
36
40
  @stdout.puts opts
@@ -54,7 +58,7 @@ module Uchardet
54
58
  end
55
59
 
56
60
  def self.list
57
- ICU::UCharsetDetector.detectable_charsets.uniq.sort.each { |name| @stdout.puts name }
61
+ ICU::UCharsetDetector.detectable_charsets.uniq.sort.each {|name| @stdout.puts name}
58
62
  end
59
63
 
60
64
  def self.detect
@@ -0,0 +1,3 @@
1
+ module Uchardet
2
+ VERSION = "0.2.0"
3
+ end
@@ -0,0 +1,27 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require 'uchardet/version'
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'uchardet'
7
+ spec.version = Uchardet::VERSION
8
+ spec.authors = ['Dmitri Goutnik']
9
+ spec.email = ['dg@syrec.org']
10
+
11
+ spec.summary = 'Fast character set encoding detection using International Components for Unicode library.'
12
+ spec.homepage = 'https://github.com/dmgk/uchardet'
13
+ spec.license = 'MIT'
14
+
15
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
16
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
17
+ end
18
+ spec.bindir = 'exe'
19
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
+ spec.require_paths = ['lib']
21
+ spec.extensions = ['ext/extconf.rb']
22
+
23
+ spec.add_development_dependency 'bundler', '~> 1.16'
24
+ spec.add_development_dependency 'rake', '~> 10.0'
25
+ spec.add_development_dependency 'rake-compiler', '~> 1.0'
26
+ spec.add_development_dependency 'minitest', '~> 5.0'
27
+ end
metadata CHANGED
@@ -1,89 +1,119 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: uchardet
3
- version: !ruby/object:Gem::Version
4
- version: 0.1.3
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
5
  platform: ruby
6
- authors:
6
+ authors:
7
7
  - Dmitri Goutnik
8
8
  autorequire:
9
- bindir: bin
9
+ bindir: exe
10
10
  cert_chain: []
11
-
12
- date: 2009-12-20 00:00:00 +03:00
13
- default_executable:
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: hoe
11
+ date: 2018-05-29 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
17
20
  type: :development
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 2.4.0
24
- version:
25
- description: Fast character set encoding detection using International Components for Unicode C++ library.
26
- email:
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '5.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '5.0'
69
+ description:
70
+ email:
27
71
  - dg@syrec.org
28
- executables:
72
+ executables:
29
73
  - uchardet
30
- extensions:
31
- - ext/uchardet/extconf.rb
32
- extra_rdoc_files:
33
- - History.txt
34
- - Manifest.txt
35
- - README.rdoc
36
- files:
37
- - History.txt
38
- - Manifest.txt
39
- - README.rdoc
74
+ extensions:
75
+ - ext/extconf.rb
76
+ extra_rdoc_files: []
77
+ files:
78
+ - ".gitignore"
79
+ - ".travis.yml"
80
+ - Gemfile
81
+ - Gemfile.lock
82
+ - LICENSE.txt
83
+ - README.md
40
84
  - Rakefile
41
- - bin/uchardet
42
- - ext/uchardet/extconf.rb
43
- - ext/uchardet/uchardet.c
85
+ - bin/console
86
+ - bin/setup
87
+ - exe/uchardet
88
+ - ext/extconf.rb
89
+ - ext/uchardet.c
44
90
  - lib/uchardet.rb
45
91
  - lib/uchardet/cli.rb
46
- - script/console
47
- - script/destroy
48
- - script/generate
49
- - tasks/extconf.rake
50
- - tasks/extconf/uchardet.rake
51
- - test/test_helper.rb
52
- - test/test_uchardet.rb
53
- - test/test_uchardet_cli.rb
54
- - test/test_uchardet_extn.rb
55
- has_rdoc: true
56
- homepage: http://rubyforge.org/projects/uchardet/
57
- licenses: []
58
-
92
+ - lib/uchardet/version.rb
93
+ - uchardet.gemspec
94
+ homepage: https://github.com/dmgk/uchardet
95
+ licenses:
96
+ - MIT
97
+ metadata: {}
59
98
  post_install_message:
60
- rdoc_options:
61
- - --main
62
- - README.rdoc
63
- require_paths:
99
+ rdoc_options: []
100
+ require_paths:
64
101
  - lib
65
- - ext/uchardet
66
- required_ruby_version: !ruby/object:Gem::Requirement
67
- requirements:
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
68
104
  - - ">="
69
- - !ruby/object:Gem::Version
70
- version: "0"
71
- version:
72
- required_rubygems_version: !ruby/object:Gem::Requirement
73
- requirements:
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ requirements:
74
109
  - - ">="
75
- - !ruby/object:Gem::Version
76
- version: "0"
77
- version:
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
78
112
  requirements: []
79
-
80
- rubyforge_project: uchardet
81
- rubygems_version: 1.3.5
113
+ rubyforge_project:
114
+ rubygems_version: 2.7.7
82
115
  signing_key:
83
- specification_version: 3
84
- summary: Fast character set encoding detection using International Components for Unicode C++ library.
85
- test_files:
86
- - test/test_helper.rb
87
- - test/test_uchardet.rb
88
- - test/test_uchardet_cli.rb
89
- - test/test_uchardet_extn.rb
116
+ specification_version: 4
117
+ summary: Fast character set encoding detection using International Components for
118
+ Unicode library.
119
+ test_files: []
@@ -1,11 +0,0 @@
1
- === 0.1.1 2009-12-19
2
-
3
- * Initial release
4
-
5
- === 0.1.2 2009-12-20
6
-
7
- * Documentation and code cleanup.
8
-
9
- === 0.1.3 2009-12-20
10
-
11
- * extconf.rb fixes for Debian/Ubuntu builds
@@ -1,18 +0,0 @@
1
- History.txt
2
- Manifest.txt
3
- README.rdoc
4
- Rakefile
5
- bin/uchardet
6
- ext/uchardet/extconf.rb
7
- ext/uchardet/uchardet.c
8
- lib/uchardet.rb
9
- lib/uchardet/cli.rb
10
- script/console
11
- script/destroy
12
- script/generate
13
- tasks/extconf.rake
14
- tasks/extconf/uchardet.rake
15
- test/test_helper.rb
16
- test/test_uchardet.rb
17
- test/test_uchardet_cli.rb
18
- test/test_uchardet_extn.rb
@@ -1,52 +0,0 @@
1
- = uchardet
2
-
3
- * http://rubyforge.org/projects/uchardet/
4
- * http://github.com/invisiblellama/uchardet
5
- * http://uchardet.rubyforge.org/rdoc/
6
-
7
- == DESCRIPTION:
8
-
9
- Fast character set encoding detection using International Components for Unicode C++ library.
10
-
11
- == SYNOPSIS:
12
-
13
- require 'open-uri'
14
- require 'uchardet'
15
-
16
- encoding = ICU::UCharsetDetector.detect open('http://google.jp').read
17
- encoding # => {:language=>"ja", :encoding=>"Shift_JIS", :confidence=>100}
18
-
19
- From command line:
20
-
21
- $ uchardet
22
-
23
- Usage: uchardet [options] file
24
- -l, --list Display list of detectable character sets.
25
- -s, --strip Strip HTML or XML markup before detection.
26
- -e, --encoding Hint the charset detector about possible encoding.
27
- -a, --all Show all matching encodings.
28
- -h, --help Show this help message.
29
-
30
- $ uchardet `which uchardet`
31
-
32
- ISO-8859-1 (confidence 60%)
33
-
34
- == REQUIREMENTS:
35
-
36
- ICU[http://site.icu-project.org/] (International Components for Unicode):
37
-
38
- on Mac OS X:
39
-
40
- sudo port install icu
41
-
42
- on Debian/Ubuntu
43
-
44
- sudo apt-get install libicu-dev
45
-
46
- == INSTALL:
47
-
48
- sudo gem install uchardet
49
-
50
- == LICENSE:
51
-
52
- Copyright (c) 2009 Dmitri Goutnik, released under the MIT license.
@@ -1,12 +0,0 @@
1
- require 'mkmf'
2
-
3
- icu_config = `which icu-config`.strip
4
- if icu_config.empty?
5
- abort "ICU seems to be missing. Try 'port install icu' or 'apt-get install libicu-dev'"
6
- end
7
-
8
- $LIBS << ' ' + `#{icu_config} --ldflags-system`.strip
9
- $LIBS << ' ' + `#{icu_config} --ldflags-libsonly`.strip
10
- $LDFLAGS << ' ' + `#{icu_config} --ldflags-searchpath`.strip
11
-
12
- create_makefile("uchardet")
@@ -1,10 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # File: script/console
3
- irb = RUBY_PLATFORM =~ /(:?mswin|mingw)/ ? 'irb.bat' : 'irb'
4
-
5
- libs = " -r irb/completion"
6
- # Perhaps use a console_lib to store any extra methods I may want available in the cosole
7
- # libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
8
- libs << " -r #{File.dirname(__FILE__) + '/../lib/chardet-icu.rb'}"
9
- puts "Loading chardet-icu gem"
10
- exec "#{irb} #{libs} --simple-prompt"
@@ -1,14 +0,0 @@
1
- #!/usr/bin/env ruby
2
- APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
-
4
- begin
5
- require 'rubigen'
6
- rescue LoadError
7
- require 'rubygems'
8
- require 'rubigen'
9
- end
10
- require 'rubigen/scripts/destroy'
11
-
12
- ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
- RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
14
- RubiGen::Scripts::Destroy.new.run(ARGV)
@@ -1,14 +0,0 @@
1
- #!/usr/bin/env ruby
2
- APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
-
4
- begin
5
- require 'rubigen'
6
- rescue LoadError
7
- require 'rubygems'
8
- require 'rubigen'
9
- end
10
- require 'rubigen/scripts/generate'
11
-
12
- ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
- RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
14
- RubiGen::Scripts::Generate.new.run(ARGV)
@@ -1,13 +0,0 @@
1
- namespace :extconf do
2
- desc "Compiles the Ruby extension"
3
- task :compile
4
- end
5
-
6
- task :compile => "extconf:compile"
7
-
8
- task :test => :compile
9
-
10
- BIN = "*.{o,bundle,jar,so,obj,pdb,lib,def,exp}"
11
- $hoe.clean_globs |= ["ext/**/#{BIN}", "lib/**/#{BIN}", 'ext/**/Makefile']
12
- $hoe.spec.require_paths = Dir['{lib,ext/*}']
13
- $hoe.spec.extensions = FileList["ext/**/extconf.rb"].to_a
@@ -1,43 +0,0 @@
1
- namespace :extconf do
2
- extension = File.basename(__FILE__, '.rake')
3
-
4
- ext = "ext/#{extension}"
5
- ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"
6
- ext_files = FileList[
7
- "#{ext}/*.c",
8
- "#{ext}/*.h",
9
- "#{ext}/*.rl",
10
- "#{ext}/extconf.rb",
11
- "#{ext}/Makefile",
12
- # "lib"
13
- ]
14
-
15
-
16
- task :compile => extension do
17
- if Dir.glob("**/#{extension}.{o,so,dll}").length == 0
18
- STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
19
- STDERR.puts "Gem actually failed to build. Your system is"
20
- STDERR.puts "NOT configured properly to build #{GEM_NAME}."
21
- STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
22
- exit(1)
23
- end
24
- end
25
-
26
- desc "Builds just the #{extension} extension"
27
- task extension.to_sym => ["#{ext}/Makefile", ext_so ]
28
-
29
- file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
30
- Dir.chdir(ext) do ruby "extconf.rb" end
31
- end
32
-
33
- file ext_so => ext_files do
34
- Dir.chdir(ext) do
35
- sh(RUBY_PLATFORM =~ /win32/ ? 'nmake' : 'make') do |ok, res|
36
- if !ok
37
- require "fileutils"
38
- FileUtils.rm Dir.glob('*.{so,o,dll,bundle}')
39
- end
40
- end
41
- end
42
- end
43
- end
@@ -1,3 +0,0 @@
1
- require 'stringio'
2
- require 'test/unit'
3
- require File.dirname(__FILE__) + '/../lib/uchardet'
@@ -1,22 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require File.dirname(__FILE__) + '/test_helper.rb'
4
-
5
- class TestUchardet < Test::Unit::TestCase # :nodoc:
6
-
7
- def test_detect
8
- detector = ICU::UCharsetDetector.new
9
- assert_equal(detector.detect(''), ICU::UCharsetDetector.detect(''))
10
- end
11
-
12
- def test_detect_all
13
- detector = ICU::UCharsetDetector.new
14
- assert_equal(detector.detect_all('∑'), ICU::UCharsetDetector.detect_all('∑'))
15
- end
16
-
17
- def test_detectable_charsets
18
- detector = ICU::UCharsetDetector.new
19
- assert_equal(detector.detectable_charsets, ICU::UCharsetDetector.detectable_charsets)
20
- end
21
-
22
- end
@@ -1,14 +0,0 @@
1
- require File.join(File.dirname(__FILE__), "test_helper.rb")
2
- require 'uchardet/cli'
3
-
4
- class TestUchardetCli < Test::Unit::TestCase
5
- def setup
6
- Uchardet::CLI.execute(@stdout_io = StringIO.new, [])
7
- @stdout_io.rewind
8
- @stdout = @stdout_io.read
9
- end
10
-
11
- def test_print_default_output
12
- assert_match(/Usage: .* \[options\] file/, @stdout)
13
- end
14
- end
@@ -1,101 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require "test/unit"
4
-
5
- $:.unshift File.dirname(__FILE__) + "/../ext/uchardet"
6
- require "uchardet.so"
7
-
8
- class TestUchardetExtn < Test::Unit::TestCase # :nodoc:
9
-
10
- def test_init
11
- assert_not_nil(ICU::UCharsetDetector)
12
-
13
- assert_nothing_raised do
14
- detector = ICU::UCharsetDetector.new
15
- assert_not_nil(detector)
16
-
17
- detector = ICU::UCharsetDetector.new nil
18
- assert_not_nil(detector)
19
-
20
- detector = ICU::UCharsetDetector.new 'some text'
21
- assert_not_nil(detector)
22
- end
23
-
24
- assert_raise(TypeError) do
25
- detector = ICU::UCharsetDetector.new 0
26
- end
27
-
28
- assert_raise(TypeError) do
29
- detector = ICU::UCharsetDetector.new Time.now
30
- end
31
- end
32
-
33
- def test_detect
34
- detector = ICU::UCharsetDetector.new
35
- assert_raise(ICU::Error) do
36
- detector.detect
37
- end
38
- e = detector.detect '∂∆∂∆∂∆'
39
- assert(e.is_a? Hash)
40
- assert(e.has_key? :encoding)
41
- assert(e.has_key? :confidence)
42
- assert(e.has_key? :language)
43
- assert_equal('utf-8', e[:encoding].downcase)
44
- e = detector.detect '··', 'utf-8'
45
- assert_equal('utf-8', e[:encoding].downcase)
46
- e = detector.detect '··', 'Shift_JIS'
47
- assert_equal('utf-8', e[:encoding].downcase)
48
- end
49
-
50
- def test_detect_all
51
- detector = ICU::UCharsetDetector.new
52
- assert_raise(ICU::Error) do
53
- detector.detect_all
54
- end
55
- a = detector.detect_all '€‹€‹€'
56
- assert(a.is_a? Array)
57
- assert_equal(false, a.empty?)
58
- assert(a[0].is_a? Hash)
59
- assert(a[0].has_key? :encoding)
60
- assert(a[0].has_key? :confidence)
61
- assert(a[0].has_key? :language)
62
- end
63
-
64
- def test_input_filtered_accessor
65
- detector = ICU::UCharsetDetector.new
66
- assert_equal(false, detector.input_filtered?)
67
- detector.input_filtered = true
68
- assert_equal(true, detector.input_filtered?)
69
- detector.input_filtered = ''
70
- assert_equal(true, detector.input_filtered?)
71
- detector.input_filtered = nil
72
- assert_equal(false, detector.input_filtered?)
73
- end
74
-
75
- def test_text_accessor
76
- detector = ICU::UCharsetDetector.new
77
- assert_equal(nil, detector.text)
78
- detector = ICU::UCharsetDetector.new 'blah'
79
- assert_equal('blah', detector.text)
80
- detector.text = 'test'
81
- assert_equal('test', detector.text)
82
- detector.detect
83
- assert_equal('test', detector.text)
84
- end
85
-
86
- def test_declared_encoding_accessor
87
- detector = ICU::UCharsetDetector.new
88
- assert_equal(nil, detector.declared_encoding)
89
- detector.declared_encoding = 'iso-8859-15'
90
- assert_equal('iso-8859-15', detector.declared_encoding)
91
- detector.detect 'test'
92
- assert_equal('iso-8859-15', detector.declared_encoding)
93
- end
94
-
95
- def test_detectable_charsets
96
- detector = ICU::UCharsetDetector.new
97
- assert_not_nil(detector.detectable_charsets)
98
- assert(detector.detectable_charsets.is_a? Array)
99
- end
100
-
101
- end