encoding_checker 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --format=documentation
data/.rvmrc ADDED
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # This is an RVM Project .rvmrc file, used to automatically load the ruby
4
+ # development environment upon cd'ing into the directory
5
+
6
+ # First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
7
+ # Only full ruby name is supported here, for short names use:
8
+ # echo "rvm use 1.9.3" > .rvmrc
9
+ environment_id="ruby-1.9.3-p125@encoding_checker"
10
+
11
+ # Uncomment the following lines if you want to verify rvm version per project
12
+ # rvmrc_rvm_version="1.10.3" # 1.10.1 seems as a safe start
13
+ # eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
14
+ # echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
15
+ # return 1
16
+ # }
17
+
18
+ # First we attempt to load the desired environment directly from the environment
19
+ # file. This is very fast and efficient compared to running through the entire
20
+ # CLI and selector. If you want feedback on which environment was used then
21
+ # insert the word 'use' after --create as this triggers verbose mode.
22
+ if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
23
+ && -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
24
+ then
25
+ \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
26
+ [[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]] &&
27
+ \. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
28
+ if [[ $- == *i* ]] # check for interactive shells
29
+ then echo "Using: $(tput setaf 2)$GEM_HOME$(tput sgr0)" # show the user the ruby and gemset they are using in green
30
+ else echo "Using: $GEM_HOME" # don't use colors in non-interactive shells
31
+ fi
32
+ else
33
+ # If the environment file has not yet been created, use the RVM CLI to select.
34
+ rvm --create use "$environment_id" || {
35
+ echo "Failed to create RVM environment '${environment_id}'."
36
+ return 1
37
+ }
38
+ fi
39
+
40
+ # If you use bundler, this might be useful to you:
41
+ # if [[ -s Gemfile ]] && {
42
+ # ! builtin command -v bundle >/dev/null ||
43
+ # builtin command -v bundle | grep $rvm_path/bin/bundle >/dev/null
44
+ # }
45
+ # then
46
+ # printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
47
+ # gem install bundler
48
+ # fi
49
+ # if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
50
+ # then
51
+ # bundle install | grep -vE '^Using|Your bundle is complete'
52
+ # fi
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ rvm:
2
+ - 1.9.2
3
+ - 1.9.3
4
+ - rbx-19mode
5
+ - jruby-19mode
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in encoding_checker.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Pavel Forkert
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,52 @@
1
+ # EncodingChecker
2
+
3
+ When you need to parse text files, you need to be sure that they are in a particular encoding
4
+ before actually parsing them. For example, some symbols are invalid for UTF-8 encoding, but nevertheless
5
+ files which are mainly in UTF-8 can contain some invalid characters, and many editors will not show you that.
6
+ This gem will help you identify lines and characters of the text which are invalid for a particular encoding.
7
+
8
+ ## Installation
9
+
10
+ This gem relies on encoding information, which is available only in Ruby 1.9.x.
11
+ Maybe someday I'll add 1.8.x support through the iconv library, but for now 1.8.x is not supported.
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ gem 'encoding_checker'
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install encoding_checker
24
+
25
+ ## Usage
26
+
27
+ # instantiate checker with encoding name
28
+ checker = EncodingChecker.new("utf-8")
29
+ # check any particular text
30
+ result = checker.check("some string with wrong\xA0symbol")
31
+
32
+ unless result.empty?
33
+ result.invalid_lines.each do |line|
34
+ # use line.content, line.index and line.invalid_characters
35
+ line.invalid_characters.each do |character|
36
+ # use character.content and character.index
37
+ end
38
+ end
39
+ end
40
+
41
+ # raises EncodingChecker::Error
42
+ checker.check!("some string with wrong\xA0symbol")
43
+
44
+ Read the specs for more information.
45
+
46
+ ## Contributing
47
+
48
+ 1. Fork it
49
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
50
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
51
+ 4. Push to the branch (`git push origin my-new-feature`)
52
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+ require 'rspec/core/rake_task'
4
+
# Defines the `spec` task using RSpec's rake integration. No per-task
# configuration is needed, so no configuration block is passed.
RSpec::Core::RakeTask.new(:spec)

# Running plain `rake` runs the spec task.
task :default => :spec
@@ -0,0 +1,19 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/encoding_checker/version', __FILE__)
3
+
# Gem specification for encoding_checker. The version is read from
# EncodingChecker::VERSION, which is loaded by the require above this block.
Gem::Specification.new do |gem|
  gem.authors       = ["Pavel Forkert"]
  gem.email         = ["fxposter@gmail.com"]
  gem.description   = %q{This gem will help you identify lines and characters of the text which are invalid for a particular encoding}
  gem.summary       = %q{When you need to parse some text files - you need to be sure, that they are in some particular encoding before actually parsing them. For example, some symbols are invalid for UTF-8 encoding, but nevertheless files which are mainly in UTF-8 can contain some invalid characters and many editors will not show you that. This gem will help you identify lines and characters of the text which are invalid for a particular encoding.}
  gem.homepage      = "https://github.com/fxposter/encoding_checker"
  # Matches the LICENSE file shipped with the gem.
  gem.license       = "MIT"

  # File lists are taken from git, so the gem has to be built from a git
  # checkout for these to be populated.
  gem.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  gem.files         = `git ls-files`.split("\n")
  gem.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  gem.name          = "encoding_checker"
  gem.require_paths = ["lib"]
  gem.version       = EncodingChecker::VERSION

  # Only needed to run the spec suite.
  gem.add_development_dependency 'rspec', '~> 2.8.0'
end
@@ -0,0 +1,3 @@
class EncodingChecker
  # Version of the gem. Frozen so the constant's string cannot be mutated
  # in place by accident.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,71 @@
1
+ # encoding: utf-8
2
+
3
+ require "encoding_checker/version"
4
+
# Locates the lines and characters of a text that are not valid in a given
# encoding.
#
#   checker = EncodingChecker.new("utf-8")
#   checker.check!("some string with wrong\xA0symbol")
class EncodingChecker
  # encoding - the encoding to validate against (for example "utf-8").
  #            It is passed to String#force_encoding unchanged.
  def initialize(encoding)
    @encoding = encoding
  end

  # Reinterprets a copy of +string+ in the configured encoding and collects
  # every line that contains an invalid byte sequence. The argument itself
  # is not modified.
  #
  # Returns a Match. It is empty when the whole text is valid. Line and
  # character indexes are zero-based; character indexes are counted from the
  # start of the line before surrounding whitespace is stripped.
  def check(string)
    text = string.dup.force_encoding(@encoding)
    return Match.new(@encoding, []) if text.valid_encoding?

    invalid_lines = []
    text.each_line.with_index do |line, line_index|
      next if line.valid_encoding?

      invalid_lines << LineMatch.new(line_index, strip_line(line), invalid_characters_in(line))
    end
    Match.new(@encoding, invalid_lines)
  end

  # Same as #check, but raises EncodingChecker::Error (carrying the Match in
  # Error#result) when invalid lines are found. Returns the empty Match
  # otherwise.
  def check!(string)
    check(string).tap do |match|
      raise Error, match unless match.empty?
    end
  end

  # Outcome of a check: the encoding that was checked against and the list
  # of LineMatch objects for the invalid lines.
  class Match < Struct.new(:encoding, :invalid_lines)
    # True when no invalid lines were found.
    def empty?
      invalid_lines.empty?
    end

    # Human-readable report: for every invalid line a header with the line
    # index and content, followed by the list of invalid characters.
    def to_s
      invalid_lines.map { |line|
        report = %(#{encoding.to_s.upcase} error on line #{line}\n)
        report << 'Invalid characters: ' << line.invalid_characters.join(', ') << "\n"
        report
      }.join("\n")
    end
  end

  # One invalid line: its zero-based index in the text, its content with
  # surrounding whitespace removed, and its CharacterMatch objects.
  class LineMatch < Struct.new(:index, :content, :invalid_characters)
    def to_s
      %(#{index}: "#{content}")
    end
  end

  # One invalid character: its zero-based index within the line and the
  # offending character itself.
  class CharacterMatch < Struct.new(:index, :content)
    def to_s
      "#{content.inspect} (#{index})"
    end
  end

  # Raised by #check!. The Match describing the problems is available
  # through #result and is also used as the exception message.
  class Error < RuntimeError
    attr_reader :result

    def initialize(result)
      @result = result
      super(result)
    end
  end

  private

  # Collects a CharacterMatch for every character of +line+ that is not
  # valid in the line's encoding.
  def invalid_characters_in(line)
    matches = []
    line.each_char.with_index do |char, index|
      matches << CharacterMatch.new(index, char) unless char.valid_encoding?
    end
    matches
  end

  # Removes leading and trailing whitespace from a line that contains
  # invalid bytes.
  #
  # NOTE(review): String#strip appears to raise ArgumentError for strings
  # with invalid byte sequences on some Ruby versions - confirm against the
  # targeted versions. To stay independent of that, the stripping is done on
  # a binary view of the bytes and the result is tagged with the original
  # encoding again. This is only done for ASCII-compatible encodings, where
  # the whitespace bytes cannot be part of a multi-byte character.
  def strip_line(line)
    return line.strip unless line.encoding.ascii_compatible?

    line.dup.force_encoding(Encoding::BINARY).strip.force_encoding(line.encoding)
  end
end
@@ -0,0 +1,63 @@
1
+ # encoding: utf-8
2
+ require 'spec_helper'
3
+
# Specs for EncodingChecker#check and #check!, using a UTF-8 checker and a
# string that contains the byte \xA0, which is not valid UTF-8 on its own.
describe EncodingChecker do
  let(:checker) { EncodingChecker.new("utf-8") }
  let(:valid_string) { "some string with only_right symbols" }
  let(:invalid_symbol) { "\xA0" }
  let(:invalid_string) { "some string with wrong#{invalid_symbol}symbol" }
  let(:invalid_text) { [valid_string, invalid_string].join("\n") }

  describe '#check(string)' do
    it 'returns result which contains invalid lines and characters in them' do
      result = checker.check(invalid_text)
      result.should_not be_empty

      result.invalid_lines.should have(1).element
      invalid_line = result.invalid_lines[0]
      check_line(invalid_line, invalid_string, 1)

      invalid_line.invalid_characters.should have(1).element
      invalid_character = invalid_line.invalid_characters[0]
      check_character(invalid_character, invalid_symbol, 22)
    end

    it 'returns empty result when a string is in the specified encoding' do
      result = checker.check(valid_string)
      result.should be_empty
      result.invalid_lines.should be_empty
    end
  end

  describe '#check!(string)' do
    it 'raises EncodingChecker::Error when errors are present' do
      expect {
        checker.check!(invalid_string)
      }.to raise_error(EncodingChecker::Error)
    end

    it 'raises EncodingChecker::Error which contains invalid lines when errors are present' do
      # The block form makes the example fail when nothing is raised, instead
      # of silently skipping the assertions.
      expect {
        checker.check!(invalid_string)
      }.to raise_error(EncodingChecker::Error) { |error|
        error.result.should_not be_nil
        error.result.invalid_lines.should have(1).element
      }
    end

    it 'returns a truthy, empty result when no errors are present' do
      result = checker.check!(valid_string)
      result.should be_true
      result.should be_empty
    end
  end

  # Asserts the content and zero-based index of an invalid line.
  def check_line(line, content, index)
    line.content.should == content
    line.index.should == index
  end

  # Asserts the content and zero-based index of an invalid character.
  def check_character(character, content, index)
    character.content.should == content
    character.index.should == index
  end
end
@@ -0,0 +1,2 @@
1
+ require 'bundler/setup'
2
+ Bundler.require(:default, :development)
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: encoding_checker
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Pavel Forkert
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-03-10 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &70149927503160 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 2.8.0
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70149927503160
25
+ description: This gem will helps you identify lines and characters of the text which
26
+ are invalid for particular encoding
27
+ email:
28
+ - fxposter@gmail.com
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - .gitignore
34
+ - .rspec
35
+ - .rvmrc
36
+ - .travis.yml
37
+ - Gemfile
38
+ - LICENSE
39
+ - README.md
40
+ - Rakefile
41
+ - encoding_checker.gemspec
42
+ - lib/encoding_checker.rb
43
+ - lib/encoding_checker/version.rb
44
+ - spec/encoding_checker_spec.rb
45
+ - spec/spec_helper.rb
46
+ homepage: https://github.com/fxposter/encoding_checker
47
+ licenses: []
48
+ post_install_message:
49
+ rdoc_options: []
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ! '>='
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
+ none: false
60
+ requirements:
61
+ - - ! '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubyforge_project:
66
+ rubygems_version: 1.8.16
67
+ signing_key:
68
+ specification_version: 3
69
+ summary: When you need to parse some text files - you need to be sure, that they are
70
+ in some particular encoding before actually parsing them. For example, some symbols
71
+ are invalid for UTF-8 encoding, but nevertheless files which are mainly in UTF-8
72
+ can contain some invalid characters and many of editors will not show you that.
73
+ This gem will help you identify lines and characters of the text which are invalid
74
+ for particular encoding.
75
+ test_files:
76
+ - spec/encoding_checker_spec.rb
77
+ - spec/spec_helper.rb