html_massage 0.0.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Gemfile +3 -1
- data/License-MIT +22 -0
- data/README.md +71 -9
- data/Rakefile +1 -1
- data/bin/html_massage +4 -0
- data/generate_readme.rb +59 -0
- data/html_massage.gemspec +22 -17
- data/lib/html_massage/cli.rb +35 -0
- data/lib/html_massage/version.rb +1 -1
- data/lib/html_massage.rb +247 -88
- data/spec/html_massage_spec.rb +210 -0
- metadata +108 -50
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/License-MIT
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Harlan T Wood
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
CHANGED
@@ -1,18 +1,80 @@
|
|
1
1
|
# html_massage
|
2
2
|
|
3
3
|
Give your HTML a massage, in just the ways it loves:
|
4
|
+
|
4
5
|
* Remove headers and footers and navigation, and strip to only the "content" part of the HTML
|
5
6
|
* Sanitize tags, removing javascript and styling
|
6
7
|
* Convert your HTML to nicely-formatted plain text
|
7
8
|
|
8
|
-
## Usage
|
9
|
+
## Sample Usage
|
10
|
+
|
11
|
+
### Full Massage
|
9
12
|
|
10
|
-
require 'rubygems'
|
11
13
|
require 'html_massage'
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
14
|
+
|
15
|
+
html = %{
|
16
|
+
<html>
|
17
|
+
<head>
|
18
|
+
<script type="text/javascript">document.write('I am a bad script');</script>
|
19
|
+
</head>
|
20
|
+
<body>
|
21
|
+
<div id="header">My Site</div>
|
22
|
+
<div>This is some great content!</div>
|
23
|
+
<a href ="foo/bar.html">Click this link</a>
|
24
|
+
</body>
|
25
|
+
</html>
|
26
|
+
}
|
27
|
+
|
28
|
+
puts HtmlMassage.html( html )
|
29
|
+
# => "<div>This is some great content!</div>"
|
30
|
+
|
31
|
+
puts HtmlMassage.text( html )
|
32
|
+
# => "This is some great content!\n"
|
33
|
+
|
34
|
+
### Content Only
|
35
|
+
|
36
|
+
html_massage = HtmlMassage.new( html,
|
37
|
+
:exclude => [ '#header' ] )
|
38
|
+
# => #<HtmlMassager::HtmlMassage ... >
|
39
|
+
|
40
|
+
puts html_massage.exclude!
|
41
|
+
# <div>This is some great content!</div>
|
42
|
+
# <a href="foo/bar.html">Click this link</a>
|
43
|
+
|
44
|
+
### Sanitize HTML
|
45
|
+
|
46
|
+
html_massage = HtmlMassage.new( html,
|
47
|
+
:exclude => [ '#header' ] )
|
48
|
+
# => #<HtmlMassager::HtmlMassage ... >
|
49
|
+
|
50
|
+
puts html_massage.sanitize_html!
|
51
|
+
# <html>
|
52
|
+
# <head>
|
53
|
+
# </head>
|
54
|
+
# <body>
|
55
|
+
# <div id="header">My Site</div>
|
56
|
+
# <div>This is some great content!</div>
|
57
|
+
# </body>
|
58
|
+
# </html>
|
59
|
+
|
60
|
+
### Make Links Absolute
|
61
|
+
|
62
|
+
html_massage = HtmlMassage.new( html,
|
63
|
+
:exclude => [ '#header' ],
|
64
|
+
:source_url => 'http://example.com/joe/page1.html' )
|
65
|
+
|
66
|
+
puts html_massage.absolutify_links!
|
67
|
+
# <html>
|
68
|
+
# <head>
|
69
|
+
# <script type="text/javascript">document.write('I am a bad script');</script>
|
70
|
+
# </head>
|
71
|
+
# <body>
|
72
|
+
# <div id="header">My Site</div>
|
73
|
+
# <div>This is some great content!</div>
|
74
|
+
# <a href ="http://example.com/joe/foo/bar.html">Click this link</a>
|
75
|
+
# </body>
|
76
|
+
# </html>
|
77
|
+
|
78
|
+
puts html_massage.absolutify_images!
|
79
|
+
#
|
80
|
+
|
data/Rakefile
CHANGED
@@ -1 +1 @@
|
|
1
|
-
require
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/html_massage
ADDED
data/generate_readme.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
class IO
|
4
|
+
def self.write( path, content )
|
5
|
+
file = File.new( path, "w" )
|
6
|
+
file.write( content )
|
7
|
+
file.close
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
CHUNK_SEP = "\n\n"
|
12
|
+
|
13
|
+
def is_code?( markdown )
|
14
|
+
markdown.start_with?( ' ' )
|
15
|
+
end
|
16
|
+
|
17
|
+
def header( text, top_newlines )
|
18
|
+
puts "\n" * top_newlines
|
19
|
+
puts '*'*10
|
20
|
+
puts text
|
21
|
+
puts '*'*10
|
22
|
+
end
|
23
|
+
|
24
|
+
system( "cp README.md README-backup-#{Time.now.to_s.gsub(/\W/, '-')}.md" )
|
25
|
+
readme = IO.read( 'README.md' )
|
26
|
+
chunks = readme.split( CHUNK_SEP )
|
27
|
+
code = ''
|
28
|
+
new_readme = ''
|
29
|
+
chunks.each do |chunk|
|
30
|
+
if is_code?( chunk )
|
31
|
+
|
32
|
+
chunk
|
33
|
+
code << chunk << CHUNK_SEP
|
34
|
+
|
35
|
+
header( 'Code', 3 )
|
36
|
+
puts code
|
37
|
+
header( 'Result', 1 )
|
38
|
+
puts result = eval( code )
|
39
|
+
|
40
|
+
unless result.nil?
|
41
|
+
p 111, chunk
|
42
|
+
result = result.to_s
|
43
|
+
_, code_sans_results = chunk.match( /\A((?: [^#].*\r?\n)+)(?: #.*\r?\n)+\Z/ ).to_a
|
44
|
+
if code_sans_results
|
45
|
+
p 222
|
46
|
+
result = result.split("\n").map{ |line| " # #{line}" }.join("\n")
|
47
|
+
chunk = code_sans_results << result << CHUNK_SEP
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
header( 'Output', 1 )
|
52
|
+
puts chunk
|
53
|
+
|
54
|
+
new_readme << chunk << CHUNK_SEP
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
58
|
+
|
59
|
+
IO.write( 'README.md', new_readme )
|
data/html_massage.gemspec
CHANGED
@@ -1,23 +1,28 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'html_massage/version'
|
4
5
|
|
5
|
-
Gem::Specification.new do |
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
6
|
+
Gem::Specification.new do |gem|
|
7
|
+
gem.name = "html_massage"
|
8
|
+
gem.version = HtmlMassager::VERSION
|
9
|
+
gem.authors = ["Harlan T Wood"]
|
10
|
+
gem.email = ["code@harlantwood.net"]
|
11
|
+
gem.homepage = "https://github.com/harlantwood/html_massage"
|
12
|
+
gem.summary = %{Massages HTML how you want to.}
|
13
|
+
gem.description = %{Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text.}
|
13
14
|
|
14
|
-
|
15
|
+
gem.files = `git ls-files`.split($/)
|
16
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
17
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
|
+
gem.require_paths = ["lib"]
|
15
19
|
|
16
|
-
|
17
|
-
|
20
|
+
gem.add_dependency "nokogiri", ">= 1.4"
|
21
|
+
gem.add_dependency "sanitize", ">= 2.0"
|
22
|
+
gem.add_dependency "thor"
|
23
|
+
gem.add_dependency "rest-client", ">= 1.6"
|
24
|
+
|
25
|
+
gem.add_development_dependency "rspec", ">= 2.5"
|
18
26
|
|
19
|
-
s.files = `git ls-files`.split("\n")
|
20
|
-
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
21
|
-
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
22
|
-
s.require_paths = ["lib"]
|
23
27
|
end
|
28
|
+
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'rest_client'
|
3
|
+
require 'html_massage'
|
4
|
+
|
5
|
+
module HtmlMassager
|
6
|
+
|
7
|
+
class CLI < Thor
|
8
|
+
|
9
|
+
desc :html, 'Download HTML from given URL and massage into html'
|
10
|
+
def html url
|
11
|
+
STDOUT.puts massage_to :html, url
|
12
|
+
end
|
13
|
+
|
14
|
+
desc :text, 'Download HTML from given URL and massage into plain text'
|
15
|
+
def text url
|
16
|
+
STDOUT.puts massage_to :text, url
|
17
|
+
end
|
18
|
+
|
19
|
+
desc :markdown, 'Download HTML from given URL and massage into markdown'
|
20
|
+
def markdown url
|
21
|
+
STDOUT.puts massage_to :markdown, url
|
22
|
+
end
|
23
|
+
|
24
|
+
no_tasks do
|
25
|
+
def massage_to output_format, url
|
26
|
+
HtmlMassage.send output_format,
|
27
|
+
RestClient.get(url),
|
28
|
+
:source_url => url,
|
29
|
+
:links => :absolute,
|
30
|
+
:images => :absolute
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
35
|
+
end
|
data/lib/html_massage/version.rb
CHANGED
data/lib/html_massage.rb
CHANGED
@@ -1,129 +1,281 @@
|
|
1
1
|
require "cgi"
|
2
2
|
require "nokogiri"
|
3
3
|
require "sanitize"
|
4
|
+
require "reverse_markdown"
|
4
5
|
require "html_massage/version"
|
5
6
|
|
6
7
|
module HtmlMassager
|
8
|
+
|
7
9
|
class HtmlMassage
|
8
|
-
def initialize( html, options )
|
9
|
-
@source_url = options[ :source_url ]
|
10
|
-
@ignored_selectors = options[ :ignored_selectors ]
|
11
|
-
@clean_html = massage_html( html )
|
12
|
-
end
|
13
10
|
|
14
|
-
|
15
|
-
html = content_only( html )
|
16
|
-
html = sanitize_html( html )
|
17
|
-
html = absolutify_links( html ) if @source_url
|
11
|
+
INCLUDE_CONTENT_ONLY = %w[
|
18
12
|
html
|
19
|
-
|
13
|
+
body
|
14
|
+
]
|
20
15
|
|
21
|
-
|
22
|
-
|
23
|
-
|
16
|
+
DEFAULT_EXCLUDE_OPTIONS = [
|
17
|
+
# general:
|
18
|
+
'head',
|
19
|
+
'title',
|
20
|
+
'meta',
|
24
21
|
|
25
|
-
|
26
|
-
|
27
|
-
|
22
|
+
'div#header',
|
23
|
+
'div.header',
|
24
|
+
'div#banner',
|
25
|
+
'div.banner',
|
26
|
+
'.footer',
|
27
|
+
'#footer',
|
28
|
+
'div#navigation',
|
29
|
+
'div.navigation',
|
30
|
+
'div#nav',
|
31
|
+
'div.nav',
|
32
|
+
'div#sidebar',
|
33
|
+
'div.sidebar',
|
34
|
+
'#breadcrumbs',
|
35
|
+
'.breadcrumbs',
|
36
|
+
'#backfornav',
|
37
|
+
'.backfornav',
|
38
|
+
'div.post-footer',
|
39
|
+
'div.navigation',
|
28
40
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
41
|
+
# wordpress:
|
42
|
+
'a#left_arrow',
|
43
|
+
'a#right_arrow',
|
44
|
+
'div#comments',
|
45
|
+
'div#comment-section',
|
46
|
+
'div#respond',
|
34
47
|
|
35
|
-
|
36
|
-
|
48
|
+
# typepad
|
49
|
+
'#pagebody > #pagebody-inner > #alpha',
|
50
|
+
'p.content-nav',
|
51
|
+
|
52
|
+
# blog widgets
|
53
|
+
'.widget_blog_subscription',
|
54
|
+
'.loggedout-follow-normal',
|
37
55
|
|
38
|
-
%w[ script noscript style ].each do |tag|
|
39
|
-
html.gsub!( %r{<#{tag}[^>]*>.*?</#{tag}>}mi, '' )
|
40
|
-
end
|
41
56
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
+
# wikipedia
|
58
|
+
|
59
|
+
'#bodyContent > #siteSub',
|
60
|
+
'#bodyContent > #contentSub',
|
61
|
+
'#bodyContent > #jump-to-nav',
|
62
|
+
'table.metadata',
|
63
|
+
'table.navbox',
|
64
|
+
'table.toc',
|
65
|
+
'div#catlinks',
|
66
|
+
'div.printfooter',
|
67
|
+
'h1 > span.editsection',
|
68
|
+
'h2 > span.editsection',
|
69
|
+
'h3 > span.editsection',
|
70
|
+
'h4 > span.editsection',
|
71
|
+
'h5 > span.editsection',
|
72
|
+
'h6 > span.editsection',
|
73
|
+
|
74
|
+
# wikipedia "message boxes" -- metadata such as "requires cleanup":
|
75
|
+
# see http://en.wikipedia.org/wiki/Template:Ambox
|
76
|
+
'table.ambox',
|
77
|
+
'table.tmbox',
|
78
|
+
'table.imbox',
|
79
|
+
'table.cmbox',
|
80
|
+
'table.ombox',
|
81
|
+
'table.fmbox',
|
82
|
+
'table.dmbox',
|
83
|
+
|
84
|
+
# mediawiki
|
85
|
+
'#mw-subcategories',
|
86
|
+
'#mw-pages',
|
87
|
+
'#mw-head',
|
88
|
+
'#mw-panel',
|
89
|
+
|
90
|
+
# social media sharing:
|
91
|
+
'ul#sharebar',
|
92
|
+
'ul#sharebarx',
|
93
|
+
'.sharedaddy',
|
94
|
+
'#sharing_email',
|
95
|
+
|
96
|
+
# signup:
|
97
|
+
'#mailchimp_signup_bottom',
|
98
|
+
]
|
99
|
+
|
100
|
+
DEFAULT_SANITIZE_OPTIONS = {
|
101
|
+
:elements => %w[
|
102
|
+
a abbr acronym address area b big
|
103
|
+
blockquote br button caption center cite
|
104
|
+
code col colgroup dd del dfn dir
|
105
|
+
div dl dt em fieldset form h1
|
106
|
+
h2 h3 h4 h5 h6 hr i
|
107
|
+
img
|
108
|
+
input ins kbd label legend li map menu
|
109
|
+
ol optgroup option p pre q s samp
|
110
|
+
select small span strike strong sub
|
111
|
+
sup table tbody td textarea tfoot th
|
112
|
+
thead tr tt u ul var
|
57
113
|
],
|
58
114
|
:attributes => {
|
59
|
-
'a' => [
|
60
|
-
'img' => [
|
61
|
-
:all => [
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
115
|
+
'a' => %w[ href ],
|
116
|
+
'img' => %w[ src ],
|
117
|
+
:all => %w[
|
118
|
+
abbr accept accept-charset
|
119
|
+
accesskey action align alt axis
|
120
|
+
border cellpadding cellspacing char
|
121
|
+
charoff class charset checked cite
|
122
|
+
clear cols colspan color
|
123
|
+
compact coords datetime dir
|
124
|
+
disabled enctype for frame
|
125
|
+
headers height hreflang
|
126
|
+
hspace id ismap label lang
|
127
|
+
longdesc maxlength media method
|
128
|
+
multiple name nohref noshade
|
129
|
+
nowrap prompt readonly rel rev
|
130
|
+
rows rowspan rules scope
|
131
|
+
selected shape size span
|
132
|
+
start summary tabindex target
|
133
|
+
title type usemap valign value
|
134
|
+
vspace width
|
135
|
+
]
|
78
136
|
},
|
137
|
+
|
138
|
+
# medium permissive list:
|
139
|
+
#:elements => [
|
140
|
+
# 'a', 'b', 'blockquote', 'br', 'code', 'dd', 'del', 'dl', 'dt',
|
141
|
+
# 'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
|
142
|
+
# 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike', 'strong', 'sub',
|
143
|
+
# 'sup', 'table', 'tbody', 'td', 'th',
|
144
|
+
# 'thead', 'tr', 'u', 'ul',
|
145
|
+
#],
|
146
|
+
|
79
147
|
:protocols => {
|
80
148
|
'a' => {'href' => ['http', 'https', 'mailto', :relative]},
|
81
149
|
'img' => {'src' => ['http', 'https', :relative]}
|
82
150
|
},
|
83
151
|
|
84
|
-
#
|
85
|
-
# Gollum has a nice way to add this to your config optionally, see:
|
86
|
-
# https://github.com/github/gollum/blob/master/lib/gollum/sanitization.rb
|
152
|
+
# Consider including for deprecated/historical or spam-suspect pages:
|
87
153
|
#
|
88
154
|
# :add_attributes => {
|
89
155
|
# 'a' => {'rel' => 'nofollow'}
|
90
156
|
# }
|
157
|
+
#
|
158
|
+
# Gollum has a nice way to add this to your config optionally, see:
|
159
|
+
# https://github.com/github/gollum/blob/master/lib/gollum/sanitization.rb
|
91
160
|
}
|
92
|
-
|
161
|
+
|
162
|
+
DEFAULTS = {
|
163
|
+
:include => INCLUDE_CONTENT_ONLY,
|
164
|
+
:exclude => DEFAULT_EXCLUDE_OPTIONS,
|
165
|
+
:sanitize => DEFAULT_SANITIZE_OPTIONS,
|
166
|
+
:links => :unchanged,
|
167
|
+
}
|
168
|
+
|
169
|
+
def self.html( html, options={} )
|
170
|
+
new( html ).massage!( options ).to_html
|
93
171
|
end
|
94
172
|
|
95
|
-
def
|
96
|
-
|
97
|
-
|
173
|
+
def self.text( html, options={} )
|
174
|
+
new( html ).massage!( options ).to_text
|
175
|
+
end
|
176
|
+
|
177
|
+
def self.markdown( html, options={} )
|
178
|
+
ReverseMarkdown.parse( self.html( html, options ) )
|
179
|
+
end
|
180
|
+
|
181
|
+
def initialize( html )
|
182
|
+
@html = html.dup
|
183
|
+
end
|
184
|
+
|
185
|
+
def massage!( options={} )
|
186
|
+
self.class.translate_old_options( options )
|
187
|
+
options = DEFAULTS.merge( options )
|
188
|
+
absolutify_links!(options[:source_url]) if options.delete( :links ) == :absolute
|
189
|
+
absolutify_images!(options[:source_url]) if options.delete( :images ) == :absolute
|
190
|
+
include!( options.delete( :include ) )
|
191
|
+
exclude!( options.delete( :exclude ) )
|
192
|
+
sanitize!( options.delete( :sanitize ) )
|
193
|
+
tidy_whitespace!
|
194
|
+
raise "Unexpected options #{options.inspect}" unless options.empty?
|
195
|
+
self
|
196
|
+
end
|
197
|
+
|
198
|
+
def self.translate_old_options( options )
|
199
|
+
options[ :exclude ] = options.delete( :ignored_selectors ) if options[ :ignored_selectors ]
|
200
|
+
end
|
201
|
+
|
202
|
+
def exclude!( selectors_to_exclude )
|
203
|
+
doc = Nokogiri::HTML( @html )
|
204
|
+
selectors_to_exclude.to_a.each do |selector_to_exclude|
|
205
|
+
( doc / selector_to_exclude ).remove
|
206
|
+
end
|
207
|
+
@html = doc.to_s
|
208
|
+
end
|
209
|
+
|
210
|
+
def include!( selectors_to_include )
|
211
|
+
section = Nokogiri::HTML( @html )
|
212
|
+
selectors_to_include.to_a.each do |selector_to_include|
|
213
|
+
subsection = section / selector_to_include
|
214
|
+
section = subsection unless subsection.empty?
|
215
|
+
end
|
216
|
+
@html = section.inner_html
|
217
|
+
end
|
218
|
+
|
219
|
+
def sanitize!( sanitize_options={} )
|
220
|
+
# Sanitize does not thoroughly remove these tags -- so we do a manual pass:
|
221
|
+
%w[ script noscript style ].each do |tag|
|
222
|
+
unless sanitize_options[ :elements ] && sanitize_options[ :elements ].include?( tag )
|
223
|
+
@html.gsub!( %r{<#{tag}[^>]*>.*?</#{tag}>}mi, '' )
|
224
|
+
end
|
225
|
+
end
|
226
|
+
|
227
|
+
@html = Sanitize.clean( @html, sanitize_options )
|
228
|
+
@html
|
229
|
+
end
|
230
|
+
|
231
|
+
def absolutify_links!(source_url)
|
232
|
+
absolutify_paths!('a', 'href', source_url)
|
233
|
+
end
|
234
|
+
|
235
|
+
def absolutify_images!(source_url)
|
236
|
+
absolutify_paths!('img', 'src', source_url)
|
237
|
+
end
|
238
|
+
|
239
|
+
def absolutify_paths!(tag_name, attr, source_url)
|
240
|
+
raise "When asking for absolute images or paths, please pass in source_url" unless source_url
|
241
|
+
match = source_url.match( %r{(^[a-z]+?://[^/]+)(/.+/)?}i )
|
242
|
+
return @html unless match
|
98
243
|
base_url = match[ 1 ]
|
99
244
|
resource_dir_url = match[ 0 ] # whole regexp match
|
245
|
+
dom = Nokogiri::HTML.fragment( @html )
|
100
246
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
247
|
+
tags = dom / tag_name
|
248
|
+
tags.each do |tag|
|
249
|
+
value = tag[ attr ]
|
250
|
+
if value
|
251
|
+
tag[ attr ] =
|
252
|
+
case value
|
253
|
+
when %r{^//} # eg src="//upload.wikimedia.org/wikipedia/Map.png"
|
254
|
+
value
|
108
255
|
when %r{^/}
|
109
|
-
File.join( base_url,
|
256
|
+
File.join( base_url, value )
|
110
257
|
when %r{^\.\.}
|
111
|
-
File.join( resource_dir_url,
|
258
|
+
File.join( resource_dir_url, value )
|
112
259
|
else
|
113
|
-
|
260
|
+
value
|
114
261
|
end
|
115
262
|
end
|
116
263
|
end
|
117
|
-
|
118
|
-
html
|
264
|
+
|
265
|
+
@html = dom.to_s.strip
|
119
266
|
end
|
120
267
|
|
121
|
-
def
|
122
|
-
@
|
268
|
+
def tidy_whitespace!
|
269
|
+
@html = strip_lines(@html)
|
270
|
+
tidy_tables!
|
271
|
+
end
|
272
|
+
|
273
|
+
def tidy_tables!
|
274
|
+
@html.gsub!(%r{(<table\b)(.+?)(</table>)}m) { open,body,close=$1,$2,$3; open + body.gsub(/\n{2,}/, "\n") + close }
|
123
275
|
end
|
124
276
|
|
125
277
|
def to_text
|
126
|
-
text = CGI.unescapeHTML( @
|
278
|
+
text = CGI.unescapeHTML( @html )
|
127
279
|
|
128
280
|
# normalize newlines
|
129
281
|
text.gsub!(/\r\n/, "\n")
|
@@ -132,7 +284,7 @@ module HtmlMassager
|
|
132
284
|
# nbsp => ' '
|
133
285
|
text.gsub!(/&nbsp;/, ' ')
|
134
286
|
|
135
|
-
# TODO: figure out how to do these in ruby 1.9
|
287
|
+
# TODO: figure out how to do these in ruby 1.9:
|
136
288
|
# They now throw 'incompatible encoding -- ascii regexp for utf8 string'
|
137
289
|
# text.gsub!( /\302\240/, ' ' ) # UTF8 for nbsp
|
138
290
|
# text.gsub!( /\240/, ' ' ) # ascii for nbsp
|
@@ -163,14 +315,21 @@ module HtmlMassager
|
|
163
315
|
"#{text}\n"
|
164
316
|
end
|
165
317
|
|
166
|
-
def strip_lines(
|
167
|
-
lines =
|
318
|
+
def strip_lines(content)
|
319
|
+
lines = content.split( $/ ) # $/ is the current ruby line ending, \n by default
|
168
320
|
lines.map!{ |line| line.strip }
|
169
|
-
|
170
|
-
|
321
|
+
processed = lines.join( $/ )
|
322
|
+
processed.strip
|
323
|
+
end
|
324
|
+
|
325
|
+
|
326
|
+
def to_html
|
327
|
+
@html.strip!
|
328
|
+
@html
|
171
329
|
end
|
172
330
|
|
173
331
|
end
|
332
|
+
|
174
333
|
end
|
175
334
|
|
176
|
-
include HtmlMassager
|
335
|
+
include HtmlMassager
|
@@ -0,0 +1,210 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'html_massage'))
|
4
|
+
|
5
|
+
describe HtmlMassager::HtmlMassage do
|
6
|
+
|
7
|
+
include HtmlMassager
|
8
|
+
|
9
|
+
describe ".html" do
|
10
|
+
it 'Should massage and output HTML' do
|
11
|
+
html = "<html><body><div>This is some great content!</div></body></html>"
|
12
|
+
HtmlMassage.html(html).should == "<div>This is some great content!</div>"
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should remove HTML "doctype"' do
|
16
|
+
html = '
|
17
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
18
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
19
|
+
<body>
|
20
|
+
<p>foobar</p>
|
21
|
+
</body>
|
22
|
+
</html>
|
23
|
+
'
|
24
|
+
HtmlMassage.html(html).strip.should == "<p>foobar</p>"
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
29
|
+
describe ".text" do
|
30
|
+
it 'Should massage and output text' do
|
31
|
+
html = "<html><body><div>This is some great content!</div></body></html>"
|
32
|
+
HtmlMassage.text(html).strip.should == "This is some great content!"
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should convert an HTML sample as expected' do
|
36
|
+
html = "
|
37
|
+
<html><body>
|
38
|
+
<h1>Title</h1>
|
39
|
+
This is the body.
|
40
|
+
Testing <a href='http://www.google.com/'>link to Google</a>.
|
41
|
+
<p />
|
42
|
+
Testing image <img src='/noimage.png'>.
|
43
|
+
<br />
|
44
|
+
The End.
|
45
|
+
</body></html>
|
46
|
+
"
|
47
|
+
HtmlMassage.text(html).strip.should == "Title
|
48
|
+
|
49
|
+
This is the body. Testing link to Google.
|
50
|
+
|
51
|
+
Testing image .
|
52
|
+
The End.
|
53
|
+
".strip.gsub(/^ +/, '')
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'should play nice with UTF8 HTML source' do
|
57
|
+
html = '
|
58
|
+
<html>
|
59
|
+
<head>
|
60
|
+
<meta content="text/html; charset=utf-8" http-equiv="content-type" />
|
61
|
+
</head>
|
62
|
+
<body>
|
63
|
+
Niq is a performer → Angry, arrogant, & so admired.
|
64
|
+
</body>
|
65
|
+
</html>
|
66
|
+
'
|
67
|
+
HtmlMassage.text(html).strip.should == "Niq is a performer → Angry, arrogant, & so admired."
|
68
|
+
end
|
69
|
+
|
70
|
+
it 'should play nice with &nbsp;' do
|
71
|
+
pending
|
72
|
+
html = '&nbsp;'
|
73
|
+
HtmlMassage.text(html).strip.should == " "
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
describe ".markdown" do
|
78
|
+
it 'Should massage and output markdown' do
|
79
|
+
html = "<html><body><div>This is some <i>great</i> content!</div></body></html>"
|
80
|
+
massaged = HtmlMassage.markdown html
|
81
|
+
massaged.strip.should == "This is some _great_ content!"
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
describe "#massage!" do
|
86
|
+
|
87
|
+
context 'invalid html' do
|
88
|
+
[
|
89
|
+
"<html><body>foobar</body>",
|
90
|
+
"<html><body>foobar</html>",
|
91
|
+
"<body>foobar</body></html>",
|
92
|
+
"<html>foobar</body></html>",
|
93
|
+
].each do |broken_html|
|
94
|
+
it "should return 'foobar' when given #{broken_html.inspect}" do
|
95
|
+
HtmlMassage.new(broken_html).massage!.to_text.strip.should == "foobar"
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
pending 'should convert an HTML sample as expected'
|
101
|
+
|
102
|
+
it 'should leave HTML entities intact' do
|
103
|
+
pending 'improve ::Node.massage_html -- handling of html entities, utf8 chars'
|
104
|
+
original = "This “branching” of creative works"
|
105
|
+
massage = HtmlMassager::HtmlMassage.new( original )
|
106
|
+
massage.massage!.should == original
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
describe ".sanitize_html" do
|
111
|
+
it 'should remove <style> tags and their contents' do
|
112
|
+
html = %~<!-- Remix button --><br />
|
113
|
+
<style type='text/css'>
|
114
|
+
a.remix_on_wikinodes_tab {
|
115
|
+
top: 25%; left: 0; width: 42px; height: 100px; color: #FFF; cursor:pointer; text-indent:-99999px; overflow:hidden; position: fixed; z-index: 99999; margin-left: -7px; background-image: url(http://www.openyourproject.org/images/remix_tab.png); _position: absolute; right: 0 !important; left: auto !important; margin-right: -7px !important; margin-left: auto !important; } a.remix_on_wikinodes_tab:hover { margin-left: -4px; margin-right: -4px !important; margin-left: auto !important;
|
116
|
+
}
|
117
|
+
</style>
|
118
|
+
<p> <script type="text/javascript" language="javascript"> document.write( '<a style="background-color: #2a2a2a;" class="remix_on_wikinodes_tab" href="http://www.openyourproject.org/nodes/new?parent=' + window.location + '" title="Remix this content on WikiNodes -- creative collaboration designed to set you free" >Remix This</a>' ); </script> <noscript>Note: you can turn on Javascript to see the ‘Remix This’ link.</noscript></p>
|
119
|
+
~
|
120
|
+
html_massager = HtmlMassage.new( html )
|
121
|
+
html_massager.sanitize!.should_not =~ /remix_on_wikinodes_tab/
|
122
|
+
end
|
123
|
+
|
124
|
+
it 'should remove <noscript> tags and their contents' do
|
125
|
+
html = %{ <noscript>Note: you can turn on Javascript to see the 'Remix This' link. </noscript> }
|
126
|
+
html_massager = HtmlMassage.new( html )
|
127
|
+
html_massager.sanitize!.strip.should == ''
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
describe '#absolutify_links' do
|
132
|
+
it 'should work for absolute path links' do
|
133
|
+
source_url = 'http://en.wikipedia.org/wiki/Singularity'
|
134
|
+
original_html = '<a href="/wiki/Ray_Kurzweil">Ray</a>'
|
135
|
+
html_massager = HtmlMassage.new( original_html )
|
136
|
+
html_massager.absolutify_links!(source_url).should ==
|
137
|
+
'<a href="http://en.wikipedia.org/wiki/Ray_Kurzweil">Ray</a>'
|
138
|
+
end
|
139
|
+
|
140
|
+
it 'should work for absolute path links (bugfix)' do
|
141
|
+
source_url = 'http://p2pfoundation.net/NextNet'
|
142
|
+
original_html = '<a href="/Ten_Principles_for_an_Autonomous_Internet" title="Ten Principles for an Autonomous Internet">Ten Principles for an Autonomous Internet</a>'
|
143
|
+
html_massager = HtmlMassage.new( original_html )
|
144
|
+
html_massager.absolutify_links!(source_url).should ==
|
145
|
+
'<a href="http://p2pfoundation.net/Ten_Principles_for_an_Autonomous_Internet" title="Ten Principles for an Autonomous Internet">Ten Principles for an Autonomous Internet</a>'
|
146
|
+
end
|
147
|
+
|
148
|
+
it 'should work for relative links' do
|
149
|
+
source_url = 'http://en.wikipedia.org/wiki/Singularity'
|
150
|
+
original_html = '<a href="../wiki/Ray_Kurzweil">Ray</a>'
|
151
|
+
html_massager = HtmlMassage.new( original_html )
|
152
|
+
html_massager.absolutify_links!(source_url).should ==
|
153
|
+
'<a href="http://en.wikipedia.org/wiki/../wiki/Ray_Kurzweil">Ray</a>'
|
154
|
+
end
|
155
|
+
|
156
|
+
it 'should leave full URLs alone' do
|
157
|
+
source_url = 'http://en.wikipedia.org/wiki/Singularity'
|
158
|
+
original_html = '<a href="http://www.wired.com/wiredscience">wired science</a>'
|
159
|
+
html_massager = HtmlMassage.new( original_html )
|
160
|
+
html_massager.absolutify_links!(source_url).should == original_html
|
161
|
+
end
|
162
|
+
|
163
|
+
it 'should leave // style URLs alone' do
|
164
|
+
source_url = 'http://en.wikipedia.org/wiki/Singularity'
|
165
|
+
original_html = '<a href="//wired.com/wiredscience">wired science</a>'
|
166
|
+
html_massager = HtmlMassage.new( original_html )
|
167
|
+
html_massager.absolutify_links!(source_url).should == original_html
|
168
|
+
end
|
169
|
+
|
170
|
+
it 'should leave "jump links" alone' do
|
171
|
+
source_url = 'http://en.wikipedia.org/wiki/Singularity'
|
172
|
+
original_html = '<a href="#cite_1">1</a>'
|
173
|
+
html_massager = HtmlMassage.new( original_html )
|
174
|
+
html_massager.absolutify_links!(source_url).should == original_html
|
175
|
+
end
|
176
|
+
end
|
177
|
+
|
178
|
+
describe '#absolutify_images!' do
|
179
|
+
it 'should work for absolute path links' do
|
180
|
+
source_url = 'http://enlightenedstructure.org/Home/'
|
181
|
+
original_html = '<img src="/IMG/we-are.png" alt="" class="icon">'
|
182
|
+
html_massager = HtmlMassage.new( original_html )
|
183
|
+
html_massager.absolutify_images!(source_url).should ==
|
184
|
+
'<img src="http://enlightenedstructure.org/IMG/we-are.png" alt="" class="icon">'
|
185
|
+
end
|
186
|
+
|
187
|
+
it 'should work for absolute path links (bugfix)' do
|
188
|
+
source_url = 'http://www.realitysandwich.com/blog/daniel_pinchbeck'
|
189
|
+
original_html = '<img src="/sites/realitysandwich.com/themes/zen/pinkreality/images/creative-commons-license.png" alt="Attribution-Noncommercial-Share Alike 3.0 Unported" title="" width="88" height="31">'
|
190
|
+
html_massager = HtmlMassage.new( original_html )
|
191
|
+
html_massager.absolutify_images!(source_url).should ==
|
192
|
+
'<img src="http://www.realitysandwich.com/sites/realitysandwich.com/themes/zen/pinkreality/images/creative-commons-license.png" alt="Attribution-Noncommercial-Share Alike 3.0 Unported" title="" width="88" height="31">'
|
193
|
+
end
|
194
|
+
|
195
|
+
it 'should leave // style URLs alone' do
|
196
|
+
source_url = 'http://en.wikipedia.org/wiki/List_of_communes_in_France_with_over_20,000_inhabitants_(2006_census)'
|
197
|
+
original_html = '<img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/f9/France-CIA_WFB_Map.png/220px-France-CIA_WFB_Map.png" width="220" height="235" class="thumbimage">'
|
198
|
+
html_massager = HtmlMassage.new( original_html )
|
199
|
+
html_massager.absolutify_images!(source_url).should == original_html
|
200
|
+
end
|
201
|
+
end
|
202
|
+
|
203
|
+
describe '#tidy_tables!' do
|
204
|
+
it 'should remove multiple newlines from tables' do
|
205
|
+
HtmlMassage.new("<table><tr>\n<th>Chư\n\n\nYang Sin National Park</th>\n\n\n</tr></table>").tidy_tables!.should ==
|
206
|
+
"<table><tr>\n<th>Chư\nYang Sin National Park</th>\n</tr></table>"
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
end
|
metadata
CHANGED
@@ -1,82 +1,140 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: html_massage
|
3
|
-
version: !ruby/object:Gem::Version
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
4
5
|
prerelease:
|
5
|
-
version: 0.0.2
|
6
6
|
platform: ruby
|
7
|
-
authors:
|
8
|
-
- Harlan
|
7
|
+
authors:
|
8
|
+
- Harlan T Wood
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2012-11-25 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
16
15
|
name: nokogiri
|
17
|
-
|
18
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
19
17
|
none: false
|
20
|
-
requirements:
|
21
|
-
- -
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 1.4
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.4'
|
24
22
|
type: :runtime
|
25
|
-
|
26
|
-
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.4'
|
30
|
+
- !ruby/object:Gem::Dependency
|
27
31
|
name: sanitize
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '2.0'
|
38
|
+
type: :runtime
|
28
39
|
prerelease: false
|
29
|
-
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '2.0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: thor
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
30
49
|
none: false
|
31
|
-
requirements:
|
32
|
-
- -
|
33
|
-
- !ruby/object:Gem::Version
|
34
|
-
version:
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
35
54
|
type: :runtime
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: rest-client
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '1.6'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '1.6'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: rspec
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '2.5'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '2.5'
|
94
|
+
description: ! 'Massages HTML how you want to: sanitize tags, remove headers and footers,
|
95
|
+
convert to plain text.'
|
96
|
+
email:
|
97
|
+
- code@harlantwood.net
|
98
|
+
executables:
|
99
|
+
- html_massage
|
42
100
|
extensions: []
|
43
|
-
|
44
101
|
extra_rdoc_files: []
|
45
|
-
|
46
|
-
files:
|
102
|
+
files:
|
47
103
|
- .gitignore
|
48
104
|
- Gemfile
|
105
|
+
- License-MIT
|
49
106
|
- README.md
|
50
107
|
- Rakefile
|
108
|
+
- bin/html_massage
|
109
|
+
- generate_readme.rb
|
51
110
|
- html_massage.gemspec
|
52
111
|
- lib/html_massage.rb
|
112
|
+
- lib/html_massage/cli.rb
|
53
113
|
- lib/html_massage/version.rb
|
54
|
-
|
114
|
+
- spec/html_massage_spec.rb
|
115
|
+
homepage: https://github.com/harlantwood/html_massage
|
55
116
|
licenses: []
|
56
|
-
|
57
117
|
post_install_message:
|
58
118
|
rdoc_options: []
|
59
|
-
|
60
|
-
require_paths:
|
119
|
+
require_paths:
|
61
120
|
- lib
|
62
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
121
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
63
122
|
none: false
|
64
|
-
requirements:
|
65
|
-
- -
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version:
|
68
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
123
|
+
requirements:
|
124
|
+
- - ! '>='
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
version: '0'
|
127
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
69
128
|
none: false
|
70
|
-
requirements:
|
71
|
-
- -
|
72
|
-
- !ruby/object:Gem::Version
|
73
|
-
version:
|
129
|
+
requirements:
|
130
|
+
- - ! '>='
|
131
|
+
- !ruby/object:Gem::Version
|
132
|
+
version: '0'
|
74
133
|
requirements: []
|
75
|
-
|
76
|
-
|
77
|
-
rubygems_version: 1.8.5
|
134
|
+
rubyforge_project:
|
135
|
+
rubygems_version: 1.8.24
|
78
136
|
signing_key:
|
79
137
|
specification_version: 3
|
80
138
|
summary: Massages HTML how you want to.
|
81
|
-
test_files:
|
82
|
-
|
139
|
+
test_files:
|
140
|
+
- spec/html_massage_spec.rb
|