charazard 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.markdown +64 -0
- data/Rakefile +17 -0
- data/charazard.gemspec +25 -0
- data/lib/charazard/version.rb +3 -0
- data/lib/charazard.rb +107 -0
- data/test/charazard_test.rb +167 -0
- data/test/test_helper.rb +8 -0
- metadata +112 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 34b9d5367e647921915519d5a3a252f683ddf91b
|
4
|
+
data.tar.gz: b762b59f2e9ddeeefb4ade4afe6511a6756d1074
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 887e42a951e9fcb07146bcf9f7adc7dbde56f706dd6d964d32d345112bc27508a38d6210430218e2165ca785d07f1c5e37d840e74198938dc3a35c58f8b19a81
|
7
|
+
data.tar.gz: dd537a7e4d2ae0d675f06731a3aac65e729a0b28f341d6c4adcc8ec60073ab312aefa605722b03b3b093d76e5470af79bf430dc3d38b5e683a24390399334e1e
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Jason Weathered
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# Charazard
|
2
|
+
|
3
|
+
Cleans up bad character encodings with liberal application of fire.
|
4
|
+
|
5
|
+
## Usage
|
6
|
+
|
7
|
+
### Converting Windows-1252 (ISO 8859-1) to UTF-8
|
8
|
+
|
9
|
+
CSV files saved by Excel on Windows are by default encoded in Windows-1252 (which is close to but not quite ISO 8859-1 or ISO Latin 1).
|
10
|
+
`Charazard.fix_invalid_unicode_literals` can be used to convert these characters into valid UTF-8 without breaking existing UTF-8 strings.
|
11
|
+
|
12
|
+
```ruby
|
13
|
+
Charazard.fix_invalid_unicode_literals("\x93Smart quotes\x94 \xC3\x9Cber Unicode")
|
14
|
+
# => "“Smart quotes” Über Unicode"
|
15
|
+
```
|
16
|
+
|
17
|
+
`Charazard.fix_invalid_unicode_literals` can be used in combination with
|
18
|
+
[`filter_io`](https://github.com/jasoncodes/filter_io) to filter CSV streams.
|
19
|
+
Here’s an example that handles UTF-8/ISO-8859-1 with mixed line endings:
|
20
|
+
|
21
|
+
``` ruby
|
22
|
+
require 'filter_io'
|
23
|
+
require 'charazard'
|
24
|
+
require 'csv'
|
25
|
+
|
26
|
+
File.open(filename, external_encoding: 'UTF-8') do |io|
|
27
|
+
io = FilterIO.new(io) do |data, state|
|
28
|
+
# fix invalid UTF-8 literals
|
29
|
+
data = Charazard.fix_invalid_unicode_literals(data)
|
30
|
+
|
31
|
+
# grab another chunk if the last character is a delimiter
|
32
|
+
raise FilterIO::NeedMoreData if data =~ /[\r\n]\z/ && !state.eof?
|
33
|
+
# normalise line endings to LF
|
34
|
+
data = data.gsub /\r\n|\r|\n/, "\n"
|
35
|
+
|
36
|
+
data
|
37
|
+
end
|
38
|
+
|
39
|
+
CSV.parse(io, row_sep: "\n") do |row|
|
40
|
+
p row
|
41
|
+
end
|
42
|
+
end
|
43
|
+
```
|
44
|
+
|
45
|
+
### Converting HTML to plain text
|
46
|
+
|
47
|
+
Feeds (RSS, product lists, etc.) often contain HTML which you may want to sanitize into plain text.
|
48
|
+
Charazard recognises basic formatting such as paragraphs and lists.
|
49
|
+
|
50
|
+
```html
|
51
|
+
<p>First sentence.</p><p>Second sentence.</p>
|
52
|
+
<ul><li>foo</li><li>bar</li></ul>
|
53
|
+
```
|
54
|
+
|
55
|
+
```ruby
|
56
|
+
text = Charazard.html_to_plain(html)
|
57
|
+
```
|
58
|
+
|
59
|
+
```markdown
|
60
|
+
First sentence.
|
61
|
+
Second sentence.
|
62
|
+
* foo
|
63
|
+
* bar
|
64
|
+
```
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'bundler/gem_tasks'
|
2
|
+
require 'rake/testtask'
|
3
|
+
|
4
|
+
task :default => :test
|
5
|
+
|
6
|
+
desc 'Run the tests'
|
7
|
+
Rake::TestTask.new do |t|
|
8
|
+
t.libs.push 'lib'
|
9
|
+
t.libs.push 'test'
|
10
|
+
t.test_files = FileList['test/*_test.rb']
|
11
|
+
t.verbose = true
|
12
|
+
end
|
13
|
+
|
14
|
+
desc 'Open a Pry console with environment'
|
15
|
+
task :console do
|
16
|
+
exec "pry -Ilib -rcharazard"
|
17
|
+
end
|
data/charazard.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'charazard/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = 'charazard'
|
8
|
+
spec.version = Charazard::VERSION
|
9
|
+
spec.authors = ['Jason Weathered']
|
10
|
+
spec.email = ['jason@jasoncodes.com']
|
11
|
+
spec.summary = %q{Cleans up bad character encodings with liberal application of fire.}
|
12
|
+
spec.homepage = 'https://github.com/jasoncodes/charazard'
|
13
|
+
spec.license = 'MIT'
|
14
|
+
|
15
|
+
spec.files = `git ls-files -z`.split("\x0")
|
16
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
17
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
|
+
spec.require_paths = ['lib']
|
19
|
+
|
20
|
+
spec.add_dependency 'nokogiri', '~> 1.5'
|
21
|
+
|
22
|
+
spec.add_development_dependency 'bundler', '~> 1.6'
|
23
|
+
spec.add_development_dependency 'rake'
|
24
|
+
spec.add_development_dependency 'pry'
|
25
|
+
end
|
data/lib/charazard.rb
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'charazard/version'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
# Utilities for repairing mis-encoded text (Windows-1252 bytes mixed into
# UTF-8) and for flattening HTML fragments into plain text.
#
# All methods are callable directly on the module (`Charazard.strip_dom(...)`)
# because the module extends itself.
module Charazard
  extend self

  # Windows-1252 byte value => Unicode code point, for the bytes 0x80-0x9F
  # where Windows-1252 differs from ISO 8859-1. Byte values that are absent
  # here (129, 141, 143, 144, 157) have no entry and are passed through
  # using their own value as the code point.
  CP1252_TABLE = {
    128 => 0x20AC, 130 => 0x201A, 131 => 0x0192, 132 => 0x201E, 133 => 0x2026, 134 => 0x2020, 135 => 0x2021,
    136 => 0x02C6, 137 => 0x2030, 138 => 0x0160, 139 => 0x2039, 140 => 0x0152, 142 => 0x017D, 145 => 0x2018,
    146 => 0x2019, 147 => 0x201C, 148 => 0x201D, 149 => 0x2022, 150 => 0x2013, 151 => 0x2014, 152 => 0x02DC,
    153 => 0x2122, 154 => 0x0161, 155 => 0x203A, 156 => 0x0153, 158 => 0x017E, 159 => 0x0178
  }.freeze

  # Matches the longest prefix of a binary string that is well-formed UTF-8
  # (restricted to printable ASCII plus TAB, LF and CR for single bytes).
  # The pattern can match the empty string, so it never fails outright.
  UTF8_REGEX_PREFIX = /\A(
      [\x09\x0A\x0D\x20-\x7E]            # ASCII
    | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
    |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
    | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
    |  \xEF[\x80-\xBE][\x80-\xBF]        # U+F000 to U+FFBF, any continuation as last byte
    |  \xEF\xBF[\x80-\xBD]               # excluding U+fffe and U+ffff
    |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
    |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
    | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
    |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
  )*/nx

  # Converts stray Windows-1252 / ISO 8859-1 bytes into UTF-8 while leaving
  # byte sequences that are already well-formed UTF-8 untouched.
  #
  # UTF-8 encoded byte order marks (EF BB BF) and replacement characters
  # (EF BF BD) are removed. The argument is not modified.
  #
  # @param str [String] input in any encoding; it is processed as raw bytes
  # @return [String] a new string tagged as UTF-8
  def fix_invalid_unicode_literals(str)
    input = str.dup.force_encoding('ASCII-8BIT')
    input.gsub!(/\xef\xbb\xbf/n, '') # byte order marker
    input.gsub!(/\xef\xbf\xbd/n, '') # replacement character

    # Pure 7-bit input needs no conversion, only the UTF-8 tag.
    return input.force_encoding('UTF-8') unless input =~ /[\x80-\xFF]/n

    output = String.new.force_encoding('ASCII-8BIT')
    until input.empty?
      # Copy the longest well-formed prefix verbatim...
      match = UTF8_REGEX_PREFIX.match(input)
      raise 'UTF8 match failed' unless match

      output << match[0]
      input = match.post_match
      next if input.empty?

      # ...then re-encode the single offending byte: via the CP1252 table
      # when it has an entry, otherwise using the byte value as the code point.
      byte = input.slice!(0, 1).ord
      output << [CP1252_TABLE.fetch(byte, byte)].pack('U').force_encoding('ASCII-8BIT')
    end
    output.force_encoding('UTF-8')
  end

  # Normalises whitespace and strips control characters.
  #
  # Vertical tabs are deleted. Every run of whitespace, non-breaking spaces
  # (U+00A0) and replacement characters (U+FFFD) collapses to one space, or
  # to one LF when +keep_newline+ is set and the run contains a CR or LF.
  # Remaining characters below 0x20 (other than LF) are then removed and the
  # result is stripped of leading and trailing whitespace.
  #
  # @param str [String] text to clean
  # @param keep_newline [Boolean] preserve line breaks as a single "\n"
  # @return [String] the cleaned copy
  def strip_dom(str, keep_newline = false)
    str = str.gsub(/\v/, '') # remove vertical tabs
    str = str.gsub(/(?:\s|\xC2\xA0|\xEF\xBF\xBD)+/) do |run|
      if keep_newline && run =~ /[\r\n]/
        "\n"
      else
        ' '
      end
    end
    str = str.gsub(/[\x00-\x09\x0b-\x1f]/, '') # remove all chars < 0x20 except 0x0A (LF)
    str.strip
  end

  # Rewrites decimal numeric character references in the Windows-1252 range
  # (for example "&#146;") to the reference of the Unicode code point given
  # by CP1252_TABLE ("&#8217;"). References whose number has no table entry
  # are left as they are.
  #
  # @param str [String] HTML text
  # @return [String] a copy with the mapped references replaced
  def fix_cp1252_entities(str)
    # A single pass is sufficient: no replacement value falls inside 128..159,
    # so a rewritten reference can never be rewritten a second time.
    str.gsub(/&#(1[2-5][0-9]);/) do |entity|
      code_point = CP1252_TABLE[Regexp.last_match(1).to_i]
      code_point ? "&##{code_point};" : entity
    end
  end

  # Converts an HTML fragment to plain text.
  #
  # The input is first passed through #fix_invalid_unicode_literals. If it
  # contains "<" or "&" it is parsed with Nokogiri: style and script elements
  # are dropped, br becomes a newline, p/div/ul/ol are surrounded by newlines
  # and list items are prefixed with "* " or "N. " (honouring a numeric
  # +start+ attribute on ol). The text is finally cleaned with #strip_dom.
  #
  # @param str [String] HTML or plain text
  # @param keep_newline [Boolean] forwarded to the final #strip_dom call
  # @return [String] plain text
  def html_to_plain(str, keep_newline = true)
    str = fix_invalid_unicode_literals(str)

    if str.include?('<') || str.include?('&')
      str = fix_cp1252_entities(strip_dom(str))
      # Escape a "<" sitting between hyphens (seen in URLs such as
      # ".../age-<-5-years") so it reaches the parser as text, not markup.
      str = str.gsub(/-<-/, '-&lt;-')
      doc = Nokogiri::HTML.fragment(str)
      doc.search('style,script').each(&:remove)
      doc.search('br').each do |br|
        br.replace(Nokogiri::XML::Text.new("\n", doc))
      end
      doc.search('p,div').each do |block|
        block.add_next_sibling(Nokogiri::XML::Text.new("\n", doc))
        block.add_previous_sibling(Nokogiri::XML::Text.new("\n", doc))
      end
      doc.search('ul,ol').each do |list|
        ordered = list.node_name.downcase == 'ol'
        # Only a purely numeric start attribute is honoured; otherwise count from 1.
        start_attr = list[:start].to_s[/\A\d+\z/]
        first_number = start_attr ? start_attr.to_i : 1
        list.add_next_sibling(Nokogiri::XML::Text.new("\n", doc))
        list.add_previous_sibling(Nokogiri::XML::Text.new("\n", doc))
        list.css('li').each_with_index do |item, index|
          bullet = ordered ? "\n#{first_number + index}. " : "\n* "
          item.add_previous_sibling(Nokogiri::XML::Text.new(bullet, doc))
        end
      end
      str = doc.inner_text
    end

    strip_dom(str, keep_newline)
  end
end
|
@@ -0,0 +1,167 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'test_helper'
|
4
|
+
|
5
|
+
describe Charazard do
|
6
|
+
describe 'fix_invalid_unicode_literals' do
|
7
|
+
test "Test ISO-8859-1 literal conversion is idempotent" do
|
8
|
+
assert_equal Charazard.html_to_plain("™ © ®"), Charazard.fix_invalid_unicode_literals(Charazard.fix_invalid_unicode_literals("\231 \251 \256"))
|
9
|
+
end
|
10
|
+
|
11
|
+
test "ISO-8859-1 characters within words" do
|
12
|
+
assert_equal "M\xc3\xa9nage \xc3\xa0 Trois for", Charazard.fix_invalid_unicode_literals("M\xe9nage \xe0 Trois for")
|
13
|
+
end
|
14
|
+
|
15
|
+
test "should fix literals in ASCII-8BIT without breaking correct UTF-8" do
|
16
|
+
input = "r\xc3\xa9sum\xe9"
|
17
|
+
input.force_encoding 'ASCII-8BIT'
|
18
|
+
expected = "r\xc3\xa9sum\xc3\xa9"
|
19
|
+
output = Charazard.fix_invalid_unicode_literals(input)
|
20
|
+
assert_equal expected, output
|
21
|
+
end
|
22
|
+
|
23
|
+
test "replacement character (U+FFFD, UTF-8 EF BF BD) as invalid literal" do
|
24
|
+
assert_equal "foo bar", Charazard.fix_invalid_unicode_literals("foo \xef\xbf\xbdbar")
|
25
|
+
end
|
26
|
+
|
27
|
+
test "byte order marker (U+FEFF, UTF-8 EF BB BF) as invalid literal" do
|
28
|
+
assert_equal "foo bar", Charazard.fix_invalid_unicode_literals("foo \xef\xbb\xbfbar")
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
describe 'strip_dom' do
|
33
|
+
test "whitespace normalisation removing newlines" do
|
34
|
+
assert_equal "foo bar", Charazard.strip_dom(" foo \n\t\tbar \n ")
|
35
|
+
end
|
36
|
+
|
37
|
+
test "whitespace normalisation keeping newlines" do
|
38
|
+
assert_equal "foo\nbar", Charazard.strip_dom(" foo \n\t\tbar \n ", true)
|
39
|
+
end
|
40
|
+
|
41
|
+
test "newline preservation with multiple newlines together" do
|
42
|
+
assert_equal "foo\nbar\nbaz", Charazard.strip_dom("foo\n\n\n\nbar\t\n\t\n\tbaz", true)
|
43
|
+
end
|
44
|
+
|
45
|
+
test "non-breaking space (UTF-8 C2 A0)" do
|
46
|
+
assert_equal "foo bar", Charazard.strip_dom("foo\xc2\xa0bar")
|
47
|
+
end
|
48
|
+
|
49
|
+
test "replacement character (U+FFFD, UTF-8 EF BF BD)" do
|
50
|
+
assert_equal "foo bar", Charazard.strip_dom("foo\xef\xbf\xbdbar")
|
51
|
+
end
|
52
|
+
|
53
|
+
test "CR in newline removal" do
|
54
|
+
assert_equal "foo bar", Charazard.strip_dom("foo\rbar", false)
|
55
|
+
end
|
56
|
+
|
57
|
+
test "CR in newline preservation" do
|
58
|
+
assert_equal "foo\nbar", Charazard.strip_dom("foo\rbar", true)
|
59
|
+
end
|
60
|
+
|
61
|
+
test "invalid Unicode code points below 0x20" do
|
62
|
+
assert_equal "foobar", Charazard.strip_dom("foo\006bar")
|
63
|
+
assert_equal "foobar\nbaz", Charazard.strip_dom("foo\006bar\n baz", true)
|
64
|
+
assert_equal "shes", Charazard.strip_dom("\x73\x68\x65\x1a\x73")
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
describe 'html_to_plain' do
|
69
|
+
test "html basic entities" do
|
70
|
+
assert_equal "foo & \"bar\" <baz>", Charazard.html_to_plain("foo & "bar\" <baz>")
|
71
|
+
assert_equal "foo\xc2\xae bar \xe2\x82\xac123!", Charazard.html_to_plain("foo® bar €123!")
|
72
|
+
end
|
73
|
+
|
74
|
+
test "html basic pass through" do
|
75
|
+
assert_equal "foo bar", Charazard.html_to_plain("foo bar")
|
76
|
+
end
|
77
|
+
|
78
|
+
test "html basic whitespace" do
|
79
|
+
assert_equal "foo bar baz. Hello world. 1 2 3.", Charazard.html_to_plain("foo bar baz. Hello world. 1 2 3.")
|
80
|
+
end
|
81
|
+
|
82
|
+
test "html basic elements" do
|
83
|
+
assert_equal "This is a test.", Charazard.html_to_plain("<strong>This</strong> <em>is <span style='text-decoration: underline'>a</span></em> <a href='http://www.example.com/' title=\"Testing\">test</a>.")
|
84
|
+
end
|
85
|
+
|
86
|
+
test "html block entities" do
|
87
|
+
assert_equal "Foo\nBar\nabc\ndef\nghi", Charazard.html_to_plain("<p>Foo</p><p>Bar</p><p>abc<p>def<br />ghi")
|
88
|
+
end
|
89
|
+
|
90
|
+
test "invalid Unicode code points in numeric entities from CP1252" do
|
91
|
+
assert_equal "that\xe2\x80\x99s \xe2\x80\x93 a test", Charazard.html_to_plain("that’s – a test")
|
92
|
+
end
|
93
|
+
|
94
|
+
test "Test common ISO-8859-1 literals" do
|
95
|
+
assert_equal Charazard.html_to_plain("Foo® Bar™ ©2010"), Charazard.html_to_plain("Foo\256 Bar\231 \2512010")
|
96
|
+
end
|
97
|
+
|
98
|
+
test "ISO-8859-1 capital A grave accent literal" do
|
99
|
+
assert_equal "\xc3\x82", Charazard.html_to_plain(Charazard.html_to_plain("\xc2"))
|
100
|
+
end
|
101
|
+
|
102
|
+
test "Line tabulation character U+000B in HTML should be stripped" do
|
103
|
+
assert_equal 'foobar', Charazard.html_to_plain("foo\x0bbar")
|
104
|
+
end
|
105
|
+
|
106
|
+
test "Newline U+000A in HTML should be preserved" do
|
107
|
+
assert_equal "foo\nbar", Charazard.html_to_plain("foo\nbar")
|
108
|
+
end
|
109
|
+
|
110
|
+
test "Tab U+0009 in HTML should be normalised to space" do
|
111
|
+
assert_equal "foo bar", Charazard.html_to_plain("foo\tbar")
|
112
|
+
end
|
113
|
+
|
114
|
+
test "Combination of different conversion cases in HTML to plain" do
|
115
|
+
assert_equal "A\nB\xc3\x82CD", Charazard.html_to_plain(Charazard.html_to_plain("A\nB\xc2C\x0bD"))
|
116
|
+
end
|
117
|
+
|
118
|
+
test "stylesheet and script blocks should be stripped" do
|
119
|
+
html = <<-HTML
|
120
|
+
foo
|
121
|
+
<script>alert('hi');</script><strong>bar</strong>
|
122
|
+
<style type="text/css">
|
123
|
+
body
|
124
|
+
{
|
125
|
+
font-face: Helvetica;
|
126
|
+
}
|
127
|
+
</style>
|
128
|
+
baz
|
129
|
+
HTML
|
130
|
+
plain = "foo bar baz"
|
131
|
+
assert_equal plain, Charazard.html_to_plain(html)
|
132
|
+
end
|
133
|
+
|
134
|
+
test "unordered lists should format as lines" do
|
135
|
+
html = "foo<ul><li>one</li><li>two</li><li>three</li></ul>bar"
|
136
|
+
plain = "foo\n* one\n* two\n* three\nbar"
|
137
|
+
assert_equal plain, Charazard.html_to_plain(html)
|
138
|
+
end
|
139
|
+
|
140
|
+
test "ordered lists should format as lines" do
|
141
|
+
html = "foo<ol><li>one</li><li>two</li><li>three</li></ol>bar"
|
142
|
+
plain = "foo\n1. one\n2. two\n3. three\nbar"
|
143
|
+
assert_equal plain, Charazard.html_to_plain(html)
|
144
|
+
end
|
145
|
+
|
146
|
+
test "ordered list with start offset" do
|
147
|
+
html = "<ol start=3><li>three</li><li>four</li></ol>"
|
148
|
+
plain = "3. three\n4. four"
|
149
|
+
assert_equal plain, Charazard.html_to_plain(html)
|
150
|
+
end
|
151
|
+
|
152
|
+
test "should be converted to UTF-8" do
|
153
|
+
input = "Foo\nBar\x99".force_encoding('ASCII-8BIT').split("\n").to_a
|
154
|
+
expected = ["Foo", "Bar\xE2\x84\xA2"]
|
155
|
+
actual = input.map { |str| Charazard.html_to_plain(str) }
|
156
|
+
assert_equal expected, actual
|
157
|
+
expected.zip(actual).each do |expected_line, actual_line|
|
158
|
+
assert_equal expected_line.encoding, actual_line.encoding
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
162
|
+
test "URL containing hyphen-less-than-hyphen passes through" do
|
163
|
+
str = "http://example.com/age-<-5-years"
|
164
|
+
assert_equal str, Charazard.html_to_plain(str)
|
165
|
+
end
|
166
|
+
end
|
167
|
+
end
|
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: charazard
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jason Weathered
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-06-11 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.5'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.5'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.6'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.6'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: pry
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description:
|
70
|
+
email:
|
71
|
+
- jason@jasoncodes.com
|
72
|
+
executables: []
|
73
|
+
extensions: []
|
74
|
+
extra_rdoc_files: []
|
75
|
+
files:
|
76
|
+
- ".gitignore"
|
77
|
+
- Gemfile
|
78
|
+
- LICENSE.txt
|
79
|
+
- README.markdown
|
80
|
+
- Rakefile
|
81
|
+
- charazard.gemspec
|
82
|
+
- lib/charazard.rb
|
83
|
+
- lib/charazard/version.rb
|
84
|
+
- test/charazard_test.rb
|
85
|
+
- test/test_helper.rb
|
86
|
+
homepage: https://github.com/jasoncodes/charazard
|
87
|
+
licenses:
|
88
|
+
- MIT
|
89
|
+
metadata: {}
|
90
|
+
post_install_message:
|
91
|
+
rdoc_options: []
|
92
|
+
require_paths:
|
93
|
+
- lib
|
94
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
95
|
+
requirements:
|
96
|
+
- - ">="
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '0'
|
99
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
requirements: []
|
105
|
+
rubyforge_project:
|
106
|
+
rubygems_version: 2.2.2
|
107
|
+
signing_key:
|
108
|
+
specification_version: 4
|
109
|
+
summary: Cleans up bad character encodings with liberal application of fire.
|
110
|
+
test_files:
|
111
|
+
- test/charazard_test.rb
|
112
|
+
- test/test_helper.rb
|