encoding_checker 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.rspec +1 -0
- data/.rvmrc +52 -0
- data/.travis.yml +5 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +52 -0
- data/Rakefile +8 -0
- data/encoding_checker.gemspec +19 -0
- data/lib/encoding_checker/version.rb +3 -0
- data/lib/encoding_checker.rb +71 -0
- data/spec/encoding_checker_spec.rb +63 -0
- data/spec/spec_helper.rb +2 -0
- metadata +77 -0
data/.gitignore
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--format=documentation
|
data/.rvmrc
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
# This is an RVM Project .rvmrc file, used to automatically load the ruby
|
4
|
+
# development environment upon cd'ing into the directory
|
5
|
+
|
6
|
+
# First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
|
7
|
+
# Only full ruby name is supported here, for short names use:
|
8
|
+
# echo "rvm use 1.9.3" > .rvmrc
|
9
|
+
environment_id="ruby-1.9.3-p125@encoding_checker"
|
10
|
+
|
11
|
+
# Uncomment the following lines if you want to verify rvm version per project
|
12
|
+
# rvmrc_rvm_version="1.10.3" # 1.10.1 seems like a safe start
|
13
|
+
# eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
|
14
|
+
# echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
|
15
|
+
# return 1
|
16
|
+
# }
|
17
|
+
|
18
|
+
# First we attempt to load the desired environment directly from the environment
|
19
|
+
# file. This is very fast and efficient compared to running through the entire
|
20
|
+
# CLI and selector. If you want feedback on which environment was used then
|
21
|
+
# insert the word 'use' after --create as this triggers verbose mode.
|
22
|
+
if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
|
23
|
+
&& -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
|
24
|
+
then
|
25
|
+
\. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
|
26
|
+
[[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]] &&
|
27
|
+
\. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
|
28
|
+
if [[ $- == *i* ]] # check for interactive shells
|
29
|
+
then echo "Using: $(tput setaf 2)$GEM_HOME$(tput sgr0)" # show the user the ruby and gemset they are using in green
|
30
|
+
else echo "Using: $GEM_HOME" # don't use colors in non-interactive shells
|
31
|
+
fi
|
32
|
+
else
|
33
|
+
# If the environment file has not yet been created, use the RVM CLI to select.
|
34
|
+
rvm --create use "$environment_id" || {
|
35
|
+
echo "Failed to create RVM environment '${environment_id}'."
|
36
|
+
return 1
|
37
|
+
}
|
38
|
+
fi
|
39
|
+
|
40
|
+
# If you use bundler, this might be useful to you:
|
41
|
+
# if [[ -s Gemfile ]] && {
|
42
|
+
# ! builtin command -v bundle >/dev/null ||
|
43
|
+
# builtin command -v bundle | grep $rvm_path/bin/bundle >/dev/null
|
44
|
+
# }
|
45
|
+
# then
|
46
|
+
# printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
|
47
|
+
# gem install bundler
|
48
|
+
# fi
|
49
|
+
# if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
|
50
|
+
# then
|
51
|
+
# bundle install | grep -vE '^Using|Your bundle is complete'
|
52
|
+
# fi
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Pavel Forkert
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# EncodingChecker
|
2
|
+
|
3
|
+
When you need to parse text files, you need to be sure that they are in a particular encoding
|
4
|
+
before actually parsing them. For example, some symbols are invalid for UTF-8 encoding, but nevertheless
|
5
|
+
files which are mainly in UTF-8 can contain some invalid characters, and many editors will not show you that.
|
6
|
+
This gem will help you identify the lines and characters of a text which are invalid for a particular encoding.
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
This gem relies on encoding information, which is available only in Ruby 1.9.x.
|
11
|
+
Maybe someday I'll add 1.8.x support through the iconv library, but for now 1.8.x is not supported.
|
12
|
+
|
13
|
+
Add this line to your application's Gemfile:
|
14
|
+
|
15
|
+
gem 'encoding_checker'
|
16
|
+
|
17
|
+
And then execute:
|
18
|
+
|
19
|
+
$ bundle
|
20
|
+
|
21
|
+
Or install it yourself as:
|
22
|
+
|
23
|
+
$ gem install encoding_checker
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
# instantiate checker with encoding name
|
28
|
+
checker = EncodingChecker.new("utf-8")
|
29
|
+
# check any particular text
|
30
|
+
result = checker.check("some string with wrong\xA0symbol")
|
31
|
+
|
32
|
+
unless result.empty?
|
33
|
+
result.invalid_lines.each do |line|
|
34
|
+
# use line.content, line.index and line.invalid_characters
|
35
|
+
line.invalid_characters.each do |character|
|
36
|
+
# use character.content and character.index
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
# raises EncodingChecker::Error
|
42
|
+
checker.check!("some string with wrong\xA0symbol")
|
43
|
+
|
44
|
+
Read the specs for more information.
|
45
|
+
|
46
|
+
## Contributing
|
47
|
+
|
48
|
+
1. Fork it
|
49
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
50
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
51
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
52
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/encoding_checker/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for encoding_checker.
#
# The executable, file and test-file lists are produced by shelling out to
# `git ls-files`, so this must be evaluated inside a git checkout.
Gem::Specification.new do |gem|
  gem.authors       = ["Pavel Forkert"]
  gem.email         = ["fxposter@gmail.com"]
  gem.description   = %q{This gem will help you identify lines and characters of the text which are invalid for particular encoding}
  gem.summary       = %q{When you need to parse some text files - you need to be sure, that they are in some particular encoding before actually parsing them. For example, some symbols are invalid for UTF-8 encoding, but nevertheless files which are mainly in UTF-8 can contain some invalid characters and many of editors will not show you that. This gem will help you identify lines and characters of the text which are invalid for particular encoding.}
  gem.homepage      = "https://github.com/fxposter/encoding_checker"

  gem.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  gem.files         = `git ls-files`.split("\n")
  gem.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  gem.name          = "encoding_checker"
  gem.require_paths = ["lib"]
  gem.version       = EncodingChecker::VERSION

  gem.add_development_dependency 'rspec', '~> 2.8.0'
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require "encoding_checker/version"
|
4
|
+
|
5
|
+
# checker = EncodingChecker.new("utf-8")
|
6
|
+
# checker.check!("some string with wrong\xA0symbol")
|
7
|
+
class EncodingChecker
|
8
|
+
def initialize(encoding)
|
9
|
+
@encoding = encoding
|
10
|
+
end
|
11
|
+
|
12
|
+
def check(string)
|
13
|
+
string = string.dup
|
14
|
+
string.force_encoding(@encoding)
|
15
|
+
return Match.new(@encoding, []) if string.valid_encoding?
|
16
|
+
errors = []
|
17
|
+
string.each_line.with_index do |line, line_index|
|
18
|
+
unless line.valid_encoding?
|
19
|
+
line_errors = []
|
20
|
+
line.each_char.with_index do |char, index|
|
21
|
+
unless char.valid_encoding?
|
22
|
+
line_errors << CharacterMatch.new(index, char)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
errors << LineMatch.new(line_index, line.strip, line_errors)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
Match.new(@encoding, errors)
|
29
|
+
end
|
30
|
+
|
31
|
+
def check!(string)
|
32
|
+
check(string).tap do |match|
|
33
|
+
raise Error, match unless match.empty?
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
class Match < Struct.new(:encoding, :invalid_lines)
|
38
|
+
def empty?
|
39
|
+
invalid_lines.empty?
|
40
|
+
end
|
41
|
+
|
42
|
+
def to_s
|
43
|
+
invalid_lines.map { |line|
|
44
|
+
result = %(#{encoding.to_s.upcase} error on line #{line}"\n)
|
45
|
+
result << 'Invalid characters: ' << line.invalid_characters.join(', ') << "\n"
|
46
|
+
result
|
47
|
+
}.join("\n")
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
class LineMatch < Struct.new(:index, :content, :invalid_characters)
|
52
|
+
def to_s
|
53
|
+
%(#{index}: "#{content}")
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
class CharacterMatch < Struct.new(:index, :content)
|
58
|
+
def to_s
|
59
|
+
"#{content.inspect} (#{index})"
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
class Error < RuntimeError
|
64
|
+
attr_reader :result
|
65
|
+
|
66
|
+
def initialize(result)
|
67
|
+
@result = result
|
68
|
+
super
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
# Behavioural spec for EncodingChecker#check and #check!, using a UTF-8
# checker and a string containing the byte 0xA0, which is invalid on its own
# in UTF-8.
describe EncodingChecker do
  let(:checker) { EncodingChecker.new("utf-8") }
  let(:valid_string) { "some string with only_right symbols" }
  let(:invalid_symbol) { "\xA0" }
  let(:invalid_string) { "some string with wrong#{invalid_symbol}symbol" }
  let(:invalid_text) { [valid_string, invalid_string].join("\n") }

  describe '#check(string)' do
    it 'returns result which contains invalid lines and characters in them' do
      result = checker.check(invalid_text)
      result.should_not be_empty

      result.invalid_lines.should have(1).element
      invalid_line = result.invalid_lines[0]
      check_line(invalid_line, invalid_string, 1)

      invalid_line.invalid_characters.should have(1).element
      invalid_character = invalid_line.invalid_characters[0]
      check_character(invalid_character, invalid_symbol, 22)
    end

    it 'returns empty result when a string is in the specified encoding' do
      result = checker.check(valid_string)
      result.should be_empty
      result.invalid_lines.should be_empty
    end
  end

  describe '#check!(string)' do
    it 'raises EncodingChecker::Error when errors are present' do
      expect {
        checker.check!(invalid_string)
      }.to raise_error(EncodingChecker::Error)
    end

    it 'raises EncodingChecker::Error which contains invalid lines when errors are present' do
      # Use raise_error with a block (braces, so the block binds to the
      # matcher) so the example fails when nothing is raised, instead of
      # passing silently as a bare begin/rescue would.
      expect {
        checker.check!(invalid_string)
      }.to raise_error(EncodingChecker::Error) { |error|
        error.result.should_not be_nil
        error.result.invalid_lines.should have(1).element
      }
    end

    it 'returns true when no errors are present' do
      checker.check!(valid_string).should be_true
    end
  end

  # Asserts the content and zero-based index of a reported line.
  def check_line(line, content, index)
    line.content.should == content
    line.index.should == index
  end

  # Asserts the content and zero-based index of a reported character.
  def check_character(character, content, index)
    character.content.should == content
    character.index.should == index
  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: encoding_checker
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Pavel Forkert
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-03-10 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: &70149927503160 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 2.8.0
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70149927503160
|
25
|
+
description: This gem will help you identify lines and characters of the text which
|
26
|
+
are invalid for particular encoding
|
27
|
+
email:
|
28
|
+
- fxposter@gmail.com
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- .gitignore
|
34
|
+
- .rspec
|
35
|
+
- .rvmrc
|
36
|
+
- .travis.yml
|
37
|
+
- Gemfile
|
38
|
+
- LICENSE
|
39
|
+
- README.md
|
40
|
+
- Rakefile
|
41
|
+
- encoding_checker.gemspec
|
42
|
+
- lib/encoding_checker.rb
|
43
|
+
- lib/encoding_checker/version.rb
|
44
|
+
- spec/encoding_checker_spec.rb
|
45
|
+
- spec/spec_helper.rb
|
46
|
+
homepage: https://github.com/fxposter/encoding_checker
|
47
|
+
licenses: []
|
48
|
+
post_install_message:
|
49
|
+
rdoc_options: []
|
50
|
+
require_paths:
|
51
|
+
- lib
|
52
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ! '>='
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
59
|
+
none: false
|
60
|
+
requirements:
|
61
|
+
- - ! '>='
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
requirements: []
|
65
|
+
rubyforge_project:
|
66
|
+
rubygems_version: 1.8.16
|
67
|
+
signing_key:
|
68
|
+
specification_version: 3
|
69
|
+
summary: When you need to parse some text files - you need to be sure, that they are
|
70
|
+
in some particular encoding before actually parsing them. For example, some symbols
|
71
|
+
are invalid for UTF-8 encoding, but nevertheless files which are mainly in UTF-8
|
72
|
+
can contain some invalid characters and many of editors will not show you that.
|
73
|
+
This gem will help you identify lines and characters of the text which are invalid
|
74
|
+
for particular encoding.
|
75
|
+
test_files:
|
76
|
+
- spec/encoding_checker_spec.rb
|
77
|
+
- spec/spec_helper.rb
|