charlock_holmes-jruby 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +4 -0
 - data/Gemfile +3 -0
 - data/Gemfile.lock +29 -0
 - data/LICENSE +20 -0
 - data/README.md +65 -0
 - data/Rakefile +17 -0
 - data/charlock_holmes-jruby.gemspec +25 -0
 - data/lib/charlock_holmes-jruby.rb +1 -0
 - data/lib/charlock_holmes.rb +5 -0
 - data/lib/charlock_holmes/charset_match_ext.rb +17 -0
 - data/lib/charlock_holmes/converter.rb +12 -0
 - data/lib/charlock_holmes/encoding_detector.rb +54 -0
 - data/lib/charlock_holmes/string.rb +21 -0
 - data/lib/charlock_holmes/version.rb +5 -0
 - data/lib/charlock_holmes_jruby.rb +1 -0
 - data/spec/converter_spec.rb +57 -0
 - data/spec/encoding_detector_spec.rb +97 -0
 - data/spec/fixtures/AnsiGraph.psm1 +0 -0
 - data/spec/fixtures/TwigExtensionsDate.es.yml +8 -0
 - data/spec/fixtures/cl-messagepack.lisp +264 -0
 - data/spec/fixtures/core.rkt +254 -0
 - data/spec/fixtures/laholator.py +131 -0
 - data/spec/fixtures/mingpao.html +455 -0
 - data/spec/fixtures/repl2.cljs +109 -0
 - data/spec/fixtures/shift_jis.html +1244 -0
 - data/spec/spec_helpers.rb +1 -0
 - data/spec/string_spec.rb +39 -0
 - metadata +144 -0
 
    
        data/.rspec
    ADDED
    
    
    
        data/Gemfile
    ADDED
    
    
    
        data/Gemfile.lock
    ADDED
    
    | 
         @@ -0,0 +1,29 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            PATH
         
     | 
| 
      
 2 
     | 
    
         
            +
              remote: .
         
     | 
| 
      
 3 
     | 
    
         
            +
              specs:
         
     | 
| 
      
 4 
     | 
    
         
            +
                charlock_holmes-jruby (0.1.0-java)
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            GEM
         
     | 
| 
      
 7 
     | 
    
         
            +
              remote: https://rubygems.org/
         
     | 
| 
      
 8 
     | 
    
         
            +
              specs:
         
     | 
| 
      
 9 
     | 
    
         
            +
                diff-lcs (1.2.1)
         
     | 
| 
      
 10 
     | 
    
         
            +
                rake (10.0.4)
         
     | 
| 
      
 11 
     | 
    
         
            +
                rake-compiler (0.8.3)
         
     | 
| 
      
 12 
     | 
    
         
            +
                  rake
         
     | 
| 
      
 13 
     | 
    
         
            +
                rspec (2.13.0)
         
     | 
| 
      
 14 
     | 
    
         
            +
                  rspec-core (~> 2.13.0)
         
     | 
| 
      
 15 
     | 
    
         
            +
                  rspec-expectations (~> 2.13.0)
         
     | 
| 
      
 16 
     | 
    
         
            +
                  rspec-mocks (~> 2.13.0)
         
     | 
| 
      
 17 
     | 
    
         
            +
                rspec-core (2.13.1)
         
     | 
| 
      
 18 
     | 
    
         
            +
                rspec-expectations (2.13.0)
         
     | 
| 
      
 19 
     | 
    
         
            +
                  diff-lcs (>= 1.1.3, < 2.0)
         
     | 
| 
      
 20 
     | 
    
         
            +
                rspec-mocks (2.13.0)
         
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
            PLATFORMS
         
     | 
| 
      
 23 
     | 
    
         
            +
              java
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
            DEPENDENCIES
         
     | 
| 
      
 26 
     | 
    
         
            +
              charlock_holmes-jruby!
         
     | 
| 
      
 27 
     | 
    
         
            +
              rake
         
     | 
| 
      
 28 
     | 
    
         
            +
              rake-compiler (>= 0.7.5)
         
     | 
| 
      
 29 
     | 
    
         
            +
              rspec
         
     | 
    
        data/LICENSE
    ADDED
    
    | 
         @@ -0,0 +1,20 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            Copyright (c) 2013 Francis Chong francis@ignition.hk
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            Permission is hereby granted, free of charge, to any person obtaining
         
     | 
| 
      
 4 
     | 
    
         
            +
            a copy of this software and associated documentation files (the
         
     | 
| 
      
 5 
     | 
    
         
            +
            "Software"), to deal in the Software without restriction, including
         
     | 
| 
      
 6 
     | 
    
         
            +
            without limitation the rights to use, copy, modify, merge, publish,
         
     | 
| 
      
 7 
     | 
    
         
            +
            distribute, sublicense, and/or sell copies of the Software, and to
         
     | 
| 
      
 8 
     | 
    
         
            +
            permit persons to whom the Software is furnished to do so, subject to
         
     | 
| 
      
 9 
     | 
    
         
            +
            the following conditions:
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            The above copyright notice and this permission notice shall be
         
     | 
| 
      
 12 
     | 
    
         
            +
            included in all copies or substantial portions of the Software.
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
         
     | 
| 
      
 15 
     | 
    
         
            +
            EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
         
     | 
| 
      
 16 
     | 
    
         
            +
            MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
         
     | 
| 
      
 17 
     | 
    
         
            +
            NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
         
     | 
| 
      
 18 
     | 
    
         
            +
            LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
         
     | 
| 
      
 19 
     | 
    
         
            +
            OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
         
     | 
| 
      
 20 
     | 
    
         
            +
            WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
         
     | 
    
        data/README.md
    ADDED
    
    | 
         @@ -0,0 +1,65 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # CharlockHolmes for JRuby
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            Character encoding detection library for JRuby, built on [ICU4J](http://site.icu-project.org/).
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            ## Requirements
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            - JRuby in 1.9 mode (or above)
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
            ## Usage
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            First, you'll need to require it:
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            ```
         
     | 
| 
      
 14 
     | 
    
         
            +
            require 'charlock_holmes'
         
     | 
| 
      
 15 
     | 
    
         
            +
            ```
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
            ## Encoding detection
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 20 
     | 
    
         
            +
            contents = File.read('test.xml')
         
     | 
| 
      
 21 
     | 
    
         
            +
            detection = CharlockHolmes::EncodingDetector.detect(contents)
         
     | 
| 
      
 22 
     | 
    
         
            +
            # => {:encoding => 'UTF-8', :confidence => 100}
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
            # optionally there will be a :language key as well, but
         
     | 
| 
      
 25 
     | 
    
         
            +
            # that's mostly only returned for legacy encodings like ISO-8859-1
         
     | 
| 
      
 26 
     | 
    
         
            +
            ```
         
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
      
 28 
     | 
    
         
            +
            NOTE: `CharlockHolmes::EncodingDetector.detect` returns `nil` if it is unable to detect an encoding.
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
            ## String monkey patch
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
            Alternatively, you can use the `detect_encoding` method that this library adds to the `String` class:
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
            ```
         
     | 
| 
      
 35 
     | 
    
         
            +
            require 'charlock_holmes/string'
         
     | 
| 
      
 36 
     | 
    
         
            +
             
     | 
| 
      
 37 
     | 
    
         
            +
            contents = File.read('test.xml')
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
            detection = contents.detect_encoding
         
     | 
| 
      
 40 
     | 
    
         
            +
            # => {:encoding => 'UTF-8', :confidence => 100}
         
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
      
 42 
     | 
    
         
            +
            # this will detect and set the encoding of `contents`, then return self
         
     | 
| 
      
 43 
     | 
    
         
            +
            contents.detect_encoding!
         
     | 
| 
      
 44 
     | 
    
         
            +
            ```
         
     | 
| 
      
 45 
     | 
    
         
            +
             
     | 
| 
      
 46 
     | 
    
         
            +
            ## Transcoding
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
            Being able to detect the encoding of some arbitrary content is nice, but what you probably want is to be able to transcode that content into an encoding your application is using.
         
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
      
 50 
     | 
    
         
            +
            ```
         
     | 
| 
      
 51 
     | 
    
         
            +
             
     | 
| 
      
 52 
     | 
    
         
            +
            content = File.read('test2.txt')
         
     | 
| 
      
 53 
     | 
    
         
            +
            detection = CharlockHolmes::EncodingDetector.detect(content)
         
     | 
| 
      
 54 
     | 
    
         
            +
            utf8_encoded_content = CharlockHolmes::Converter.convert content, detection[:encoding], 'UTF-8'
         
     | 
| 
      
 55 
     | 
    
         
            +
            ```
         
     | 
| 
      
 56 
     | 
    
         
            +
             
     | 
| 
      
 57 
     | 
    
         
            +
            The first parameter is the content to transcode, the second is the source encoding (the encoding the content is assumed to be in), and the third parameter is the destination encoding.
         
     | 
| 
      
 58 
     | 
    
         
            +
             
     | 
| 
      
 59 
     | 
    
         
            +
            ## Installing
         
     | 
| 
      
 60 
     | 
    
         
            +
             
     | 
| 
      
 61 
     | 
    
         
            +
            ```
         
     | 
| 
      
 62 
     | 
    
         
            +
            gem install charlock_holmes-jruby
         
     | 
| 
      
 63 
     | 
    
         
            +
            ```
         
     | 
| 
      
 64 
     | 
    
         
            +
             
     | 
| 
      
 65 
     | 
    
         
            +
             
     | 
    
        data/Rakefile
    ADDED
    
    | 
         @@ -0,0 +1,17 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            $LOAD_PATH << 'lib'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require 'rake/testtask'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'charlock_holmes/version'
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            # Build the gem archive from charlock_holmes-jruby.gemspec in the
            # current working directory by shelling out to `gem build`.
            task :package do
              system %(gem build charlock_holmes-jruby.gemspec)
            end
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            # Release the gem: runs :package first, creates an annotated git tag
            # "v<VERSION>" unless that tag already exists, then pushes the built
            # gem and moves the archive into the pkg directory.
            task :release => :package do
              version_string = "v#{CharlockHolmes::VERSION}"

              # Compare against whole tag names. A substring test on the raw
              # `git tag -l` output would treat "v0.1.0" as already present when
              # only a longer tag such as "v0.1.0.1" exists, and skip tagging.
              existing_tags = %x(git tag -l).split("\n").map(&:strip)
              unless existing_tags.include?(version_string)
                system %(git tag -a #{version_string} -m #{version_string})
              end

              system %(gem push charlock_holmes-jruby-*.gem && mv charlock_holmes-jruby-*.gem pkg)
            end
         
     | 
| 
         @@ -0,0 +1,25 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require './lib/charlock_holmes/version' unless defined? CharlockHolmes::VERSION
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            # Gem specification for the JRuby (java platform) build of charlock_holmes.
            Gem::Specification.new do |s|
              s.name = %q{charlock_holmes-jruby}
              s.version = CharlockHolmes::VERSION
              s.platform = 'java'
              s.authors = ["Francis Chong"]
              s.date = Time.now.utc.strftime("%Y-%m-%d")
              s.email = %q{francis@ignition.hk}
              # The LICENSE file shipped with the gem carries the MIT license text.
              s.license = 'MIT'

              # Package every file tracked by git; requires building from a git checkout.
              s.files = `git ls-files`.split("\n")
              s.homepage = %q{http://github.com/siuying/charlock_holmes-jruby}
              s.rdoc_options = ["--charset=UTF-8"]
              s.require_paths = ["lib"]
              s.rubygems_version = %q{1.4.2}
              s.summary = %q{Character encoding detection, brought to you by ICU.}
              s.description = %q{JRuby compatible ICU encoding detection built on top of ICU4J.}
              s.test_files = `git ls-files spec`.split("\n")

              s.add_development_dependency 'rake-compiler', ">= 0.7.5"
              s.add_development_dependency 'rspec'
              s.add_development_dependency 'rake'
            end
         
     | 
| 
         @@ -0,0 +1 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'charlock_holmes'
         
     | 
| 
         @@ -0,0 +1,17 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require_relative 'version'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require_relative "../#{CharlockHolmes::ICU_JAR_FILENAME}"
         
     | 
| 
      
 3 
     | 
    
         
            +
            java_import 'com.ibm.icu.text.CharsetMatch'
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            module CharlockHolmes
              # Mixin that exposes a charset match as a plain Ruby Hash.
              # The including object must respond to getName, getConfidence
              # and getLanguage.
              module CharsetMatchExt
                # Summarise this match.
                #
                # @return [Hash] with the keys :encoding, :confidence and :language,
                #   filled from getName, getConfidence and getLanguage respectively
                def to_hash
                  {
                    :encoding   => getName,
                    :confidence => getConfidence,
                    :language   => getLanguage
                  }
                end
              end
            end
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
            # Mix CharsetMatchExt (and therefore #to_hash) into the ICU4J CharsetMatch class.
            com::ibm::icu::text::CharsetMatch.send :include, CharlockHolmes::CharsetMatchExt
         
     | 
| 
         @@ -0,0 +1,12 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module CharlockHolmes
              # Transcoding helper, callable as CharlockHolmes::Converter.convert.
              module Converter
                extend self

                # Transcode +string+ from one encoding to another.
                #
                # @param string [String] content to transcode
                # @param from [String] encoding the content is assumed to be in
                # @param to [String] destination encoding
                # @return [String] result of String#encode(to, from)
                # @raise [TypeError] if string, from or to is nil (or false)
                def convert(string, from, to)
                  # Arguments are validated in declaration order so the first
                  # missing one determines the error message.
                  [
                    [string, "string cannot be nil"],
                    [from,   "from cannot be nil"],
                    [to,     "to cannot be nil"]
                  ].each do |value, message|
                    raise TypeError, message unless value
                  end

                  string.encode(to, from)
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,54 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require_relative 'version'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require_relative "../#{CharlockHolmes::ICU_JAR_FILENAME}"
         
     | 
| 
      
 3 
     | 
    
         
            +
            require_relative 'charset_match_ext'
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            java_import 'com.ibm.icu.text.CharsetDetector'
         
     | 
| 
      
 6 
     | 
    
         
            +
            java_import 'java.io.ByteArrayInputStream'
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
            module CharlockHolmes
              # Encoding detection backed by ICU4J's CharsetDetector.
              module EncodingDetector
                extend self

                # Detect the best-matching charset for +string+.
                #
                # @param string [String] content whose bytes are inspected
                # @param hint [String, nil] declared encoding handed to the detector
                # @return [Hash, nil] the match converted with #to_hash, or nil when
                #   the detector yields no match or detection raises an error caught
                #   by the bare rescue
                def detect(string, hint=nil)
                  detector = create_detector(string, hint)

                  begin
                    best = detector.detect()
                    best ? best.to_hash : nil
                  rescue
                    nil
                  end
                end

                # Detect every candidate charset for +string+.
                #
                # @param string [String] content whose bytes are inspected
                # @param hint [String, nil] declared encoding handed to the detector
                # @return [Array<Hash>, nil] one hash per match, or nil when the
                #   detector yields nothing or detection raises an error caught by
                #   the bare rescue
                def detect_all(string, hint=nil)
                  detector = create_detector(string, hint)

                  begin
                    candidates = detector.detectAll()
                    candidates ? candidates.map { |candidate| candidate.to_hash } : nil
                  rescue
                    nil
                  end
                end

                # @return [Array] the charsets reported by
                #   CharsetDetector.getAllDetectableCharsets, as a Ruby array
                def all_detectable_charsets
                  CharsetDetector.getAllDetectableCharsets().to_a
                end

                private

                # Build a CharsetDetector primed with the bytes of +string+ and the
                # declared-encoding +hint+.
                def create_detector(string, hint=nil)
                  CharsetDetector.new.tap do |detector|
                    detector.setText(ByteArrayInputStream.new(string.to_java_bytes))
                    detector.setDeclaredEncoding(hint)
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,21 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require_relative 'encoding_detector'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module CharlockHolmes
              # Encoding-detection convenience methods intended to be mixed into String.
              module StringExt
                # Detect the most likely encoding of this string.
                #
                # @param hint [String, nil] passed through to EncodingDetector.detect
                # @return the result of EncodingDetector.detect(self, hint)
                def detect_encoding(hint=nil)
                  EncodingDetector.detect(self, hint)
                end

                # Detect all candidate encodings of this string.
                #
                # @param hint [String, nil] passed through to EncodingDetector.detect_all
                # @return the result of EncodingDetector.detect_all(self, hint)
                def detect_encodings(hint=nil)
                  EncodingDetector.detect_all(self, hint)
                end

                # Detect the encoding and, when one is found, tag this string with it
                # via force_encoding (the bytes are not transcoded).
                #
                # @param hint [String, nil] passed through to EncodingDetector.detect
                # @return [self] always, whether or not an encoding was detected
                def detect_encoding!(hint=nil)
                  detected = EncodingDetector.detect(self, hint)
                  # EncodingDetector.detect returns nil when no encoding can be
                  # determined; guard against that so the string is left untouched
                  # instead of raising NoMethodError on nil.
                  force_encoding(detected[:encoding]) if detected && detected[:encoding]
                  self
                end
              end
            end
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
      
 21 
     | 
    
         
            +
            # Mix StringExt into String so every string gains detect_encoding,
            # detect_encodings and detect_encoding!.
            String.send :include, CharlockHolmes::StringExt
         
     | 
| 
         @@ -0,0 +1 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'charlock_holmes'
         
     | 
| 
         @@ -0,0 +1,57 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require 'spec_helpers'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'charlock_holmes'
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            describe CharlockHolmes::Converter do
              # Every example calls convert on the Converter class itself, so the
              # subject is the class rather than an instance.
              subject { CharlockHolmes::Converter }

              describe "#convert" do
                it "should convert ascii from iso8859-1 to utf-16 and back" do
                  input = 'test'

                  # Forward conversion: the result is tagged UTF-16, is larger in
                  # bytes than the input, and no longer compares equal to it.
                  output = subject.convert input, 'ISO-8859-1', 'UTF-16'
                  output.encoding.to_s.should == "UTF-16"
                  input.bytesize.should < output.bytesize
                  input.should_not == output

                  # Round trip: converting back must restore the original string.
                  output = subject.convert output, 'UTF-16', 'ISO-8859-1'
                  output.encoding.to_s.should == "ISO-8859-1"
                  input.bytesize.should == output.bytesize
                  input.should == output
                end

                it "should convert utf8 to utf16 and back" do
                  # Non-ASCII input, so the round trip exercises multi-byte characters.
                  input = 'λ, λ, λ'

                  output = subject.convert input, 'UTF-8', 'UTF-16'
                  output.encoding.to_s.should == "UTF-16"
                  input.bytesize.should < output.bytesize
                  input.should_not == output

                  output = subject.convert output, 'UTF-16', 'UTF-8'
                  output.encoding.to_s.should == "UTF-8"
                  input.bytesize.should == output.bytesize
                  input.should == output
                end

                it "should raise error if params are not string" do
                  # nil in any of the three positions must be rejected with TypeError.
                  expect {
                    subject.convert nil, 'UTF-8', 'UTF-16'
                  }.to raise_error(TypeError)

                  expect {
                    subject.convert 'lol', 'UTF-8', nil
                  }.to raise_error(TypeError)

                  expect {
                    subject.convert 'lol', nil, 'UTF-16'
                  }.to raise_error(TypeError)

                  # All-string arguments must not raise at all. The negative matcher
                  # is deliberately given no error class: `to_not raise_error(TypeError)`
                  # would also pass when some *other* exception is raised, which
                  # would hide a real failure.
                  expect {
                    subject.convert 'lol', 'UTF-8', 'UTF-8'
                  }.to_not raise_error
                end
              end

            end
         
     | 
| 
         @@ -0,0 +1,97 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: utf-8
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            require 'spec_helpers'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'charlock_holmes'
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            describe CharlockHolmes::EncodingDetector do
              # Every example calls the detector's class-level methods, so the
              # subject is the class rather than an instance.
              subject { CharlockHolmes::EncodingDetector }

              describe "#detect" do
                it "should detect encoding of string" do
                  detected = subject.detect 'hello'
                  detected[:encoding].should == 'ISO-8859-1'
                end

                it "should accept encoding hint" do
                  # The hint is passed as the second argument; the expected
                  # detection result is the same as without it.
                  detected = subject.detect 'hello', 'UTF-8'
                  detected[:encoding].should == 'ISO-8859-1'
                end

                # Fixture file name (under spec/fixtures) paired with the encoding
                # the detector is expected to report for it.
                MAPPING = [
                  ['repl2.cljs', 'ISO-8859-1'],
                  ['core.rkt', 'UTF-8'],
                  ['cl-messagepack.lisp', 'ISO-8859-1'],
                  ['TwigExtensionsDate.es.yml', 'UTF-8'],
                  ['AnsiGraph.psm1', 'UTF-16LE'],
                  ['laholator.py', 'UTF-8'],
                  ['mingpao.html', 'Big5'],
                  ['shift_jis.html', 'Shift_JIS']
                ]

                # One example is generated per fixture.
                MAPPING.each do |file, encoding|
                  it "should detect encoding of test file #{file}" do
                    path = File.expand_path "../fixtures/#{file}", __FILE__
                    # Read in binary mode so the detector sees the exact bytes on
                    # disk; text mode may apply newline translation on some
                    # platforms, which would corrupt non-ASCII-compatible
                    # fixtures such as the UTF-16LE one.
                    content = File.open(path, 'rb') { |io| io.read }
                    guessed = subject.detect content
                    guessed[:encoding].should == encoding
                    # The reported encoding must also be one the raw bytes are
                    # actually valid in.
                    content.force_encoding guessed[:encoding]
                    content.valid_encoding?.should be_true
                  end
                end
              end

              describe "#detect_all" do
                it "should return array of possible matches" do
                  detected_list = subject.detect_all 'test'
                  detected_list.should be_a(Array)

                  # Sorted so the comparison does not depend on the order in which
                  # matches are returned.
                  encoding_list = detected_list.map {|d| d[:encoding] }.sort
                  encoding_list.should == ['ISO-8859-1', 'ISO-8859-2', 'UTF-8']
                end

                it "should accept encoding hint" do
                  detected_list = subject.detect_all 'test', 'UTF-8'
                  detected_list.should be_a(Array)

                  encoding_list = detected_list.map {|d| d[:encoding] }.sort
                  encoding_list.should == ['ISO-8859-1', 'ISO-8859-2', 'UTF-8']
                end
              end

              describe "#all_detectable_charsets" do
                it "should return array of charsets" do
                  list = subject.all_detectable_charsets
                  list.should be_a(Array)
                  # Exact, order-sensitive comparison against the expected charset names.
                  list.should == %w{
                    UTF-8
                    UTF-16BE
                    UTF-16LE
                    UTF-32BE
                    UTF-32LE
                    Shift_JIS
                    ISO-2022-JP
                    ISO-2022-CN
                    ISO-2022-KR
                    GB18030
                    EUC-JP
                    EUC-KR
                    Big5
                    ISO-8859-1
                    ISO-8859-2
                    ISO-8859-5
                    ISO-8859-6
                    ISO-8859-7
                    ISO-8859-8
                    windows-1251
                    windows-1256
                    KOI8-R
                    ISO-8859-9
                    IBM424_rtl
                    IBM424_ltr
                    IBM420_rtl
                    IBM420_ltr
                  }
                end
              end
            end
         
     |