unihawk 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +24 -0
- data/README.md +40 -0
- data/lib/unihawk.rb +33 -0
- data/proof.rb +18 -0
- data/spec/assets/example-utf-16be +0 -0
- data/spec/assets/example-utf-16le +0 -0
- data/spec/assets/example-utf-8 +1 -0
- data/spec/assets/example-utf-8-no-bom +1 -0
- data/spec/read_spec.rb +40 -0
- data/spec/spec_helper.rb +6 -0
- data/unihawk.gemspec +20 -0
- metadata +98 -0
    
        data/.gitignore
    ADDED
    
    | @@ -0,0 +1 @@ | |
| 1 | 
            +
            .bundle*
         | 
    
        data/Gemfile
    ADDED
    
    
    
        data/Gemfile.lock
    ADDED
    
    | @@ -0,0 +1,24 @@ | |
| 1 | 
            +
            GEM
         | 
| 2 | 
            +
              remote: http://rubygems.org/
         | 
| 3 | 
            +
              specs:
         | 
| 4 | 
            +
                diff-lcs (1.1.3)
         | 
| 5 | 
            +
                multi_json (1.3.6)
         | 
| 6 | 
            +
                rspec (2.11.0)
         | 
| 7 | 
            +
                  rspec-core (~> 2.11.0)
         | 
| 8 | 
            +
                  rspec-expectations (~> 2.11.0)
         | 
| 9 | 
            +
                  rspec-mocks (~> 2.11.0)
         | 
| 10 | 
            +
                rspec-core (2.11.1)
         | 
| 11 | 
            +
                rspec-expectations (2.11.3)
         | 
| 12 | 
            +
                  diff-lcs (~> 1.1.3)
         | 
| 13 | 
            +
                rspec-mocks (2.11.3)
         | 
| 14 | 
            +
                simplecov (0.7.1)
         | 
| 15 | 
            +
                  multi_json (~> 1.0)
         | 
| 16 | 
            +
                  simplecov-html (~> 0.7.1)
         | 
| 17 | 
            +
                simplecov-html (0.7.1)
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            PLATFORMS
         | 
| 20 | 
            +
              ruby
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            DEPENDENCIES
         | 
| 23 | 
            +
              rspec
         | 
| 24 | 
            +
              simplecov
         | 
    
        data/README.md
    ADDED
    
    | @@ -0,0 +1,40 @@ | |
| 1 | 
            +
            Unihawk
         | 
| 2 | 
            +
            ===================
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            Unihawk is a Rubygem that automatically detects the encoding of a file and converts the content into a target encoding.
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            It currently supports UTF-16LE, UTF-16BE, and UTF-8. If ```unihawk``` cannot guess the encoding, it will assume UTF-8.
         | 
| 7 | 
            +
             | 
| 8 | 
            +
             | 
| 9 | 
            +
            How to use it
         | 
| 10 | 
            +
            -------------------
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            First of all, include it in your Gemfile by adding ```gem 'unihawk'```
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            Use ```Unihawk.convert()``` to process a string read from a file.
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            For example:
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            ```
         | 
| 19 | 
            +
            content = File.new('/some_path/some_file.txt').read
         | 
| 20 | 
            +
            content = Unihawk.convert(content, 'utf-8')
         | 
| 21 | 
            +
            ```
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            ```content``` will be converted to UTF-8, with the BOM removed if one exists.
         | 
| 24 | 
            +
             | 
| 25 | 
            +
             | 
| 26 | 
            +
            Author
         | 
| 27 | 
            +
            --------------------
         | 
| 28 | 
            +
            Tanin Na Nakorn (@tanin47)
         | 
| 29 | 
            +
             | 
| 30 | 
            +
             | 
| 31 | 
            +
             | 
| 32 | 
            +
            License
         | 
| 33 | 
            +
            -------------------
         | 
| 34 | 
            +
            Copyright (c) 2012 Twitter, Inc.
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
         | 
| 37 | 
            +
             | 
| 38 | 
            +
            The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
         | 
| 39 | 
            +
             | 
| 40 | 
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
         | 
    
        data/lib/unihawk.rb
    ADDED
    
    | @@ -0,0 +1,33 @@ | |
| 1 | 
            +
            require 'iconv'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            class Unihawk
         | 
| 4 | 
            +
              BYTE_ORDER_MARKS = {
         | 
| 5 | 
            +
                'utf-8' => [0xEF, 0xBB, 0xBF],
         | 
| 6 | 
            +
                'utf-16le' => [0xFF, 0xFE],
         | 
| 7 | 
            +
                'utf-16be' => [0xFE, 0xFF]
         | 
| 8 | 
            +
              }
         | 
| 9 | 
            +
             | 
| 10 | 
            +
              def self.convert(content, target_encoding)
         | 
| 11 | 
            +
                from_encoding = ''
         | 
| 12 | 
            +
                bom3 = content[0..2].scan(/./).map{|c| c[0]}
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                BYTE_ORDER_MARKS.each_pair { |encoding, prefix|
         | 
| 15 | 
            +
                  ok = true
         | 
| 16 | 
            +
                  prefix.each_with_index { |b, i|
         | 
| 17 | 
            +
                    if b != bom3[i]
         | 
| 18 | 
            +
                      ok = false
         | 
| 19 | 
            +
                      break
         | 
| 20 | 
            +
                    end
         | 
| 21 | 
            +
                  }
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                  from_encoding = encoding if ok == true
         | 
| 24 | 
            +
                }
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                if from_encoding == '' # assume utf-8 with no BOM
         | 
| 27 | 
            +
                  Iconv.conv(target_encoding, 'utf-8', content)
         | 
| 28 | 
            +
                else
         | 
| 29 | 
            +
                  content = Iconv.conv(target_encoding, from_encoding, content)
         | 
| 30 | 
            +
                  content[BYTE_ORDER_MARKS[target_encoding].length..-1]
         | 
| 31 | 
            +
                end
         | 
| 32 | 
            +
              end
         | 
| 33 | 
            +
            end
         | 
    
        data/proof.rb
    ADDED
    
    | @@ -0,0 +1,18 @@ | |
| 1 | 
            +
             | 
| 2 | 
            +
            content = File.new('spec/assets/example-utf-16le').read
         | 
| 3 | 
            +
            puts content
         | 
| 4 | 
            +
            content.each_byte {|c| print "#{c.to_s(16)} " }
         | 
| 5 | 
            +
            puts
         | 
| 6 | 
            +
            puts
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            content = File.new('spec/assets/example-utf-16be').read
         | 
| 9 | 
            +
            puts content
         | 
| 10 | 
            +
            content.each_byte {|c| print "#{c.to_s(16)} " }
         | 
| 11 | 
            +
            puts
         | 
| 12 | 
            +
            puts
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            content = File.new('spec/assets/example-utf-8').read
         | 
| 15 | 
            +
            puts content
         | 
| 16 | 
            +
            content.each_byte {|c| print "#{c.to_s(16)} " }
         | 
| 17 | 
            +
            puts
         | 
| 18 | 
            +
            puts
         | 
| Binary file | 
| Binary file | 
| @@ -0,0 +1 @@ | |
| 1 | 
            +
            test
         | 
| @@ -0,0 +1 @@ | |
| 1 | 
            +
            test
         | 
    
        data/spec/read_spec.rb
    ADDED
    
    | @@ -0,0 +1,40 @@ | |
| 1 | 
            +
            require 'spec_helper'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            ALL = [
         | 
| 4 | 
            +
              { :encoding => "utf-16le", :file => File.expand_path("../assets/example-utf-16le", __FILE__) },
         | 
| 5 | 
            +
              { :encoding => "utf-16be", :file => File.expand_path("../assets/example-utf-16be", __FILE__) },
         | 
| 6 | 
            +
              { :encoding => "utf-8", :file => File.expand_path("../assets/example-utf-8", __FILE__) }
         | 
| 7 | 
            +
            ]
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            describe 'Read files' do
         | 
| 10 | 
            +
              before(:all) do
         | 
| 11 | 
            +
                ALL.each { |encoding|
         | 
| 12 | 
            +
                  encoding[:content] = File.new(encoding[:file]).read.scan(/./).map{|c| c[0]}
         | 
| 13 | 
            +
                  encoding[:content][0..Unihawk::BYTE_ORDER_MARKS[encoding[:encoding]].length-1].should =~ Unihawk::BYTE_ORDER_MARKS[encoding[:encoding]]
         | 
| 14 | 
            +
                }
         | 
| 15 | 
            +
              end
         | 
| 16 | 
            +
             | 
| 17 | 
            +
              it "reads and convert correctly" do
         | 
| 18 | 
            +
                ALL.each { |from|
         | 
| 19 | 
            +
                  ALL.each { |to|
         | 
| 20 | 
            +
                    puts "Convert from #{from[:encoding]} to #{to[:encoding]}"
         | 
| 21 | 
            +
                    content = Unihawk.convert(File.new(from[:file]).read, to[:encoding])
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                    expected_content = to[:content][Unihawk::BYTE_ORDER_MARKS[to[:encoding]].length..-1]
         | 
| 24 | 
            +
                    content.scan(/./).map{|c| c[0]}.should =~ expected_content
         | 
| 25 | 
            +
                    puts "--OK"
         | 
| 26 | 
            +
                  }
         | 
| 27 | 
            +
                }
         | 
| 28 | 
            +
              end
         | 
| 29 | 
            +
             | 
| 30 | 
            +
              it "reads no BOM into utf-8" do
         | 
| 31 | 
            +
                ALL.each { |to|
         | 
| 32 | 
            +
                    puts "Convert from utf-8 (no BOM) to #{to[:encoding]}"
         | 
| 33 | 
            +
                    content = Unihawk.convert("test", to[:encoding])
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                    expected_content = to[:content][Unihawk::BYTE_ORDER_MARKS[to[:encoding]].length..-1]
         | 
| 36 | 
            +
                    content.scan(/./).map{|c| c[0]}.should =~ expected_content
         | 
| 37 | 
            +
                    puts "--OK"
         | 
| 38 | 
            +
                  }
         | 
| 39 | 
            +
              end
         | 
| 40 | 
            +
            end
         | 
    
        data/spec/spec_helper.rb
    ADDED
    
    
    
        data/unihawk.gemspec
    ADDED
    
    | @@ -0,0 +1,20 @@ | |
| 1 | 
            +
            # -*- encoding: utf-8 -*-
         | 
| 2 | 
            +
            $:.push File.expand_path("../lib", __FILE__)
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            Gem::Specification.new do |s|
         | 
| 5 | 
            +
              s.name        = "unihawk"
         | 
| 6 | 
            +
              s.version     = "0.0.1"
         | 
| 7 | 
            +
              s.platform    = Gem::Platform::RUBY
         | 
| 8 | 
            +
              s.authors     = ["Tanin Na Nakorn"]
         | 
| 9 | 
            +
              s.email       = ["tanin47@yahoo.com"]
         | 
| 10 | 
            +
              s.homepage    = "http://github.com/twitter/unihawk"
         | 
| 11 | 
            +
              s.summary     = %q{unihawk}
         | 
| 12 | 
            +
              s.description = %q{Read a file in a target encoding (Only works with UTF-16LE, UTF-16BE, and UTF-8)}
         | 
| 13 | 
            +
             | 
| 14 | 
            +
              s.files         = `git ls-files`.split("\n")
         | 
| 15 | 
            +
              s.test_files    = `git ls-files -- {rails,spec}/*`.split("\n")
         | 
| 16 | 
            +
              s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
         | 
| 17 | 
            +
              s.require_paths = ["lib"]
         | 
| 18 | 
            +
             | 
| 19 | 
            +
              s.add_dependency('iconv')
         | 
| 20 | 
            +
            end
         | 
    
        metadata
    ADDED
    
    | @@ -0,0 +1,98 @@ | |
| 1 | 
            +
            --- !ruby/object:Gem::Specification 
         | 
| 2 | 
            +
            name: unihawk
         | 
| 3 | 
            +
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            +
              hash: 29
         | 
| 5 | 
            +
              prerelease: 
         | 
| 6 | 
            +
              segments: 
         | 
| 7 | 
            +
              - 0
         | 
| 8 | 
            +
              - 0
         | 
| 9 | 
            +
              - 1
         | 
| 10 | 
            +
              version: 0.0.1
         | 
| 11 | 
            +
            platform: ruby
         | 
| 12 | 
            +
            authors: 
         | 
| 13 | 
            +
            - Tanin Na Nakorn
         | 
| 14 | 
            +
            autorequire: 
         | 
| 15 | 
            +
            bindir: bin
         | 
| 16 | 
            +
            cert_chain: []
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            date: 2012-10-18 00:00:00 -07:00
         | 
| 19 | 
            +
            default_executable: 
         | 
| 20 | 
            +
            dependencies: 
         | 
| 21 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 22 | 
            +
              name: iconv
         | 
| 23 | 
            +
              prerelease: false
         | 
| 24 | 
            +
              requirement: &id001 !ruby/object:Gem::Requirement 
         | 
| 25 | 
            +
                none: false
         | 
| 26 | 
            +
                requirements: 
         | 
| 27 | 
            +
                - - ">="
         | 
| 28 | 
            +
                  - !ruby/object:Gem::Version 
         | 
| 29 | 
            +
                    hash: 3
         | 
| 30 | 
            +
                    segments: 
         | 
| 31 | 
            +
                    - 0
         | 
| 32 | 
            +
                    version: "0"
         | 
| 33 | 
            +
              type: :runtime
         | 
| 34 | 
            +
              version_requirements: *id001
         | 
| 35 | 
            +
            description: Read a file in a target encoding (Only works with UTF-16LE, UTF-16BE, and UTF-8)
         | 
| 36 | 
            +
            email: 
         | 
| 37 | 
            +
            - tanin47@yahoo.com
         | 
| 38 | 
            +
            executables: []
         | 
| 39 | 
            +
             | 
| 40 | 
            +
            extensions: []
         | 
| 41 | 
            +
             | 
| 42 | 
            +
            extra_rdoc_files: []
         | 
| 43 | 
            +
             | 
| 44 | 
            +
            files: 
         | 
| 45 | 
            +
            - .gitignore
         | 
| 46 | 
            +
            - Gemfile
         | 
| 47 | 
            +
            - Gemfile.lock
         | 
| 48 | 
            +
            - README.md
         | 
| 49 | 
            +
            - lib/unihawk.rb
         | 
| 50 | 
            +
            - proof.rb
         | 
| 51 | 
            +
            - spec/assets/example-utf-16be
         | 
| 52 | 
            +
            - spec/assets/example-utf-16le
         | 
| 53 | 
            +
            - spec/assets/example-utf-8
         | 
| 54 | 
            +
            - spec/assets/example-utf-8-no-bom
         | 
| 55 | 
            +
            - spec/read_spec.rb
         | 
| 56 | 
            +
            - spec/spec_helper.rb
         | 
| 57 | 
            +
            - unihawk.gemspec
         | 
| 58 | 
            +
            has_rdoc: true
         | 
| 59 | 
            +
            homepage: http://github.com/twitter/unihawk
         | 
| 60 | 
            +
            licenses: []
         | 
| 61 | 
            +
             | 
| 62 | 
            +
            post_install_message: 
         | 
| 63 | 
            +
            rdoc_options: []
         | 
| 64 | 
            +
             | 
| 65 | 
            +
            require_paths: 
         | 
| 66 | 
            +
            - lib
         | 
| 67 | 
            +
            required_ruby_version: !ruby/object:Gem::Requirement 
         | 
| 68 | 
            +
              none: false
         | 
| 69 | 
            +
              requirements: 
         | 
| 70 | 
            +
              - - ">="
         | 
| 71 | 
            +
                - !ruby/object:Gem::Version 
         | 
| 72 | 
            +
                  hash: 3
         | 
| 73 | 
            +
                  segments: 
         | 
| 74 | 
            +
                  - 0
         | 
| 75 | 
            +
                  version: "0"
         | 
| 76 | 
            +
            required_rubygems_version: !ruby/object:Gem::Requirement 
         | 
| 77 | 
            +
              none: false
         | 
| 78 | 
            +
              requirements: 
         | 
| 79 | 
            +
              - - ">="
         | 
| 80 | 
            +
                - !ruby/object:Gem::Version 
         | 
| 81 | 
            +
                  hash: 3
         | 
| 82 | 
            +
                  segments: 
         | 
| 83 | 
            +
                  - 0
         | 
| 84 | 
            +
                  version: "0"
         | 
| 85 | 
            +
            requirements: []
         | 
| 86 | 
            +
             | 
| 87 | 
            +
            rubyforge_project: 
         | 
| 88 | 
            +
            rubygems_version: 1.6.2
         | 
| 89 | 
            +
            signing_key: 
         | 
| 90 | 
            +
            specification_version: 3
         | 
| 91 | 
            +
            summary: unihawk
         | 
| 92 | 
            +
            test_files: 
         | 
| 93 | 
            +
            - spec/assets/example-utf-16be
         | 
| 94 | 
            +
            - spec/assets/example-utf-16le
         | 
| 95 | 
            +
            - spec/assets/example-utf-8
         | 
| 96 | 
            +
            - spec/assets/example-utf-8-no-bom
         | 
| 97 | 
            +
            - spec/read_spec.rb
         | 
| 98 | 
            +
            - spec/spec_helper.rb
         |