html_massage 0.0.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/Gemfile +3 -1
- data/License-MIT +22 -0
- data/README.md +71 -9
- data/Rakefile +1 -1
- data/bin/html_massage +4 -0
- data/generate_readme.rb +59 -0
- data/html_massage.gemspec +22 -17
- data/lib/html_massage/cli.rb +35 -0
- data/lib/html_massage/version.rb +1 -1
- data/lib/html_massage.rb +247 -88
- data/spec/html_massage_spec.rb +210 -0
- metadata +108 -50
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/License-MIT
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Harlan T Wood
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
CHANGED
@@ -1,18 +1,80 @@
|
|
1
1
|
# html_massage
|
2
2
|
|
3
3
|
Give your HTML a massage, in just the ways it loves:
|
4
|
+
|
4
5
|
* Remove headers and footers and navigation, and strip to only the "content" part of the HTML
|
5
6
|
* Sanitize tags, removing javascript and styling
|
6
7
|
* Convert your HTML to nicely-formatted plain text
|
7
8
|
|
8
|
-
## Usage
|
9
|
+
## Sample Usage
|
10
|
+
|
11
|
+
### Full Massage
|
9
12
|
|
10
|
-
require 'rubygems'
|
11
13
|
require 'html_massage'
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
14
|
+
|
15
|
+
html = %{
|
16
|
+
<html>
|
17
|
+
<head>
|
18
|
+
<script type="text/javascript">document.write('I am a bad script');</script>
|
19
|
+
</head>
|
20
|
+
<body>
|
21
|
+
<div id="header">My Site</div>
|
22
|
+
<div>This is some great content!</div>
|
23
|
+
<a href ="foo/bar.html">Click this link</a>
|
24
|
+
</body>
|
25
|
+
</html>
|
26
|
+
}
|
27
|
+
|
28
|
+
puts HtmlMassage.html( html )
|
29
|
+
# => "<div>This is some great content!</div>"
|
30
|
+
|
31
|
+
puts HtmlMassage.text( html )
|
32
|
+
# => "This is some great content!\n"
|
33
|
+
|
34
|
+
### Content Only
|
35
|
+
|
36
|
+
html_massage = HtmlMassage.new( html,
|
37
|
+
:exclude => [ '#header' ] )
|
38
|
+
# => #<HtmlMassager::HtmlMassage ... >
|
39
|
+
|
40
|
+
puts html_massage.exclude!
|
41
|
+
# <div>This is some great content!</div>
|
42
|
+
# <a href="foo/bar.html">Click this link</a>
|
43
|
+
|
44
|
+
### Sanitize HTML
|
45
|
+
|
46
|
+
html_massage = HtmlMassage.new( html,
|
47
|
+
:exclude => [ '#header' ] )
|
48
|
+
# => #<HtmlMassager::HtmlMassage ... >
|
49
|
+
|
50
|
+
puts html_massage.sanitize_html!
|
51
|
+
# <html>
|
52
|
+
# <head>
|
53
|
+
# </head>
|
54
|
+
# <body>
|
55
|
+
# <div id="header">My Site</div>
|
56
|
+
# <div>This is some great content!</div>
|
57
|
+
# </body>
|
58
|
+
# </html>
|
59
|
+
|
60
|
+
### Make Links Absolute
|
61
|
+
|
62
|
+
html_massage = HtmlMassage.new( html,
|
63
|
+
:exclude => [ '#header' ],
|
64
|
+
:source_url => 'http://example.com/joe/page1.html' )
|
65
|
+
|
66
|
+
puts html_massage.absolutify_links!
|
67
|
+
# <html>
|
68
|
+
# <head>
|
69
|
+
# <script type="text/javascript">document.write('I am a bad script');</script>
|
70
|
+
# </head>
|
71
|
+
# <body>
|
72
|
+
# <div id="header">My Site</div>
|
73
|
+
# <div>This is some great content!</div>
|
74
|
+
# <a href ="http://example.com/joe/foo/bar.html">Click this link</a>
|
75
|
+
# </body>
|
76
|
+
# </html>
|
77
|
+
|
78
|
+
puts html_massage.absolutify_images!
|
79
|
+
#
|
80
|
+
|
data/Rakefile
CHANGED
@@ -1 +1 @@
|
|
1
|
-
require
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/html_massage
ADDED
data/generate_readme.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
class IO
|
4
|
+
def self.write( path, content )
|
5
|
+
file = File.new( path, "w" )
|
6
|
+
file.write( content )
|
7
|
+
file.close
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
CHUNK_SEP = "\n\n"
|
12
|
+
|
13
|
+
def is_code?( markdown )
|
14
|
+
markdown.start_with?( ' ' )
|
15
|
+
end
|
16
|
+
|
17
|
+
def header( text, top_newlines )
|
18
|
+
puts "\n" * top_newlines
|
19
|
+
puts '*'*10
|
20
|
+
puts text
|
21
|
+
puts '*'*10
|
22
|
+
end
|
23
|
+
|
24
|
+
system( "cp README.md README-backup-#{Time.now.to_s.gsub(/\W/, '-')}.md" )
|
25
|
+
readme = IO.read( 'README.md' )
|
26
|
+
chunks = readme.split( CHUNK_SEP )
|
27
|
+
code = ''
|
28
|
+
new_readme = ''
|
29
|
+
chunks.each do |chunk|
|
30
|
+
if is_code?( chunk )
|
31
|
+
|
32
|
+
chunk
|
33
|
+
code << chunk << CHUNK_SEP
|
34
|
+
|
35
|
+
header( 'Code', 3 )
|
36
|
+
puts code
|
37
|
+
header( 'Result', 1 )
|
38
|
+
puts result = eval( code )
|
39
|
+
|
40
|
+
unless result.nil?
|
41
|
+
p 111, chunk
|
42
|
+
result = result.to_s
|
43
|
+
_, code_sans_results = chunk.match( /\A((?: [^#].*\r?\n)+)(?: #.*\r?\n)+\Z/ ).to_a
|
44
|
+
if code_sans_results
|
45
|
+
p 222
|
46
|
+
result = result.split("\n").map{ |line| " # #{line}" }.join("\n")
|
47
|
+
chunk = code_sans_results << result << CHUNK_SEP
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
header( 'Output', 1 )
|
52
|
+
puts chunk
|
53
|
+
|
54
|
+
new_readme << chunk << CHUNK_SEP
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
58
|
+
|
59
|
+
IO.write( 'README.md', new_readme )
|
data/html_massage.gemspec
CHANGED
@@ -1,23 +1,28 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'html_massage/version'
|
4
5
|
|
5
|
-
Gem::Specification.new do |
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
6
|
+
Gem::Specification.new do |gem|
|
7
|
+
gem.name = "html_massage"
|
8
|
+
gem.version = HtmlMassager::VERSION
|
9
|
+
gem.authors = ["Harlan T Wood"]
|
10
|
+
gem.email = ["code@harlantwood.net"]
|
11
|
+
gem.homepage = "https://github.com/harlantwood/html_massage"
|
12
|
+
gem.summary = %{Massages HTML how you want to.}
|
13
|
+
gem.description = %{Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text.}
|
13
14
|
|
14
|
-
|
15
|
+
gem.files = `git ls-files`.split($/)
|
16
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
17
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
|
+
gem.require_paths = ["lib"]
|
15
19
|
|
16
|
-
|
17
|
-
|
20
|
+
gem.add_dependency "nokogiri", ">= 1.4"
|
21
|
+
gem.add_dependency "sanitize", ">= 2.0"
|
22
|
+
gem.add_dependency "thor"
|
23
|
+
gem.add_dependency "rest-client", ">= 1.6"
|
24
|
+
|
25
|
+
gem.add_development_dependency "rspec", ">= 2.5"
|
18
26
|
|
19
|
-
s.files = `git ls-files`.split("\n")
|
20
|
-
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
21
|
-
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
22
|
-
s.require_paths = ["lib"]
|
23
27
|
end
|
28
|
+
|
data/lib/html_massage/cli.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'rest_client'
|
3
|
+
require 'html_massage'
|
4
|
+
|
5
|
+
module HtmlMassager
|
6
|
+
|
7
|
+
class CLI < Thor
|
8
|
+
|
9
|
+
desc :html, 'Download HTML from given URL and massage into html'
|
10
|
+
def html url
|
11
|
+
STDOUT.puts massage_to :html, url
|
12
|
+
end
|
13
|
+
|
14
|
+
desc :text, 'Download HTML from given URL and massage into plain text'
|
15
|
+
def text url
|
16
|
+
STDOUT.puts massage_to :text, url
|
17
|
+
end
|
18
|
+
|
19
|
+
desc :markdown, 'Download HTML from given URL and massage into markdown'
|
20
|
+
def markdown url
|
21
|
+
STDOUT.puts massage_to :markdown, url
|
22
|
+
end
|
23
|
+
|
24
|
+
no_tasks do
|
25
|
+
def massage_to output_format, url
|
26
|
+
HtmlMassage.send output_format,
|
27
|
+
RestClient.get(url),
|
28
|
+
:source_url => url,
|
29
|
+
:links => :absolute,
|
30
|
+
:images => :absolute
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
35
|
+
end
|
data/lib/html_massage/version.rb
CHANGED
data/lib/html_massage.rb
CHANGED
@@ -1,129 +1,281 @@
|
|
1
1
|
require "cgi"
|
2
2
|
require "nokogiri"
|
3
3
|
require "sanitize"
|
4
|
+
require "reverse_markdown"
|
4
5
|
require "html_massage/version"
|
5
6
|
|
6
7
|
module HtmlMassager
|
8
|
+
|
7
9
|
class HtmlMassage
|
8
|
-
def initialize( html, options )
|
9
|
-
@source_url = options[ :source_url ]
|
10
|
-
@ignored_selectors = options[ :ignored_selectors ]
|
11
|
-
@clean_html = massage_html( html )
|
12
|
-
end
|
13
10
|
|
14
|
-
|
15
|
-
html = content_only( html )
|
16
|
-
html = sanitize_html( html )
|
17
|
-
html = absolutify_links( html ) if @source_url
|
11
|
+
INCLUDE_CONTENT_ONLY = %w[
|
18
12
|
html
|
19
|
-
|
13
|
+
body
|
14
|
+
]
|
20
15
|
|
21
|
-
|
22
|
-
|
23
|
-
|
16
|
+
DEFAULT_EXCLUDE_OPTIONS = [
|
17
|
+
# general:
|
18
|
+
'head',
|
19
|
+
'title',
|
20
|
+
'meta',
|
24
21
|
|
25
|
-
|
26
|
-
|
27
|
-
|
22
|
+
'div#header',
|
23
|
+
'div.header',
|
24
|
+
'div#banner',
|
25
|
+
'div.banner',
|
26
|
+
'.footer',
|
27
|
+
'#footer',
|
28
|
+
'div#navigation',
|
29
|
+
'div.navigation',
|
30
|
+
'div#nav',
|
31
|
+
'div.nav',
|
32
|
+
'div#sidebar',
|
33
|
+
'div.sidebar',
|
34
|
+
'#breadcrumbs',
|
35
|
+
'.breadcrumbs',
|
36
|
+
'#backfornav',
|
37
|
+
'.backfornav',
|
38
|
+
'div.post-footer',
|
39
|
+
'div.navigation',
|
28
40
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
41
|
+
# wordpress:
|
42
|
+
'a#left_arrow',
|
43
|
+
'a#right_arrow',
|
44
|
+
'div#comments',
|
45
|
+
'div#comment-section',
|
46
|
+
'div#respond',
|
34
47
|
|
35
|
-
|
36
|
-
|
48
|
+
# typepad
|
49
|
+
'#pagebody > #pagebody-inner > #alpha',
|
50
|
+
'p.content-nav',
|
51
|
+
|
52
|
+
# blog widgets
|
53
|
+
'.widget_blog_subscription',
|
54
|
+
'.loggedout-follow-normal',
|
37
55
|
|
38
|
-
%w[ script noscript style ].each do |tag|
|
39
|
-
html.gsub!( %r{<#{tag}[^>]*>.*?</#{tag}>}mi, '' )
|
40
|
-
end
|
41
56
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
+
# wikipedia
|
58
|
+
|
59
|
+
'#bodyContent > #siteSub',
|
60
|
+
'#bodyContent > #contentSub',
|
61
|
+
'#bodyContent > #jump-to-nav',
|
62
|
+
'table.metadata',
|
63
|
+
'table.navbox',
|
64
|
+
'table.toc',
|
65
|
+
'div#catlinks',
|
66
|
+
'div.printfooter',
|
67
|
+
'h1 > span.editsection',
|
68
|
+
'h2 > span.editsection',
|
69
|
+
'h3 > span.editsection',
|
70
|
+
'h4 > span.editsection',
|
71
|
+
'h5 > span.editsection',
|
72
|
+
'h6 > span.editsection',
|
73
|
+
|
74
|
+
# wikipedia "message boxes" -- metadata such as "requires cleanup":
|
75
|
+
# see http://en.wikipedia.org/wiki/Template:Ambox
|
76
|
+
'table.ambox',
|
77
|
+
'table.tmbox',
|
78
|
+
'table.imbox',
|
79
|
+
'table.cmbox',
|
80
|
+
'table.ombox',
|
81
|
+
'table.fmbox',
|
82
|
+
'table.dmbox',
|
83
|
+
|
84
|
+
# mediawiki
|
85
|
+
'#mw-subcategories',
|
86
|
+
'#mw-pages',
|
87
|
+
'#mw-head',
|
88
|
+
'#mw-panel',
|
89
|
+
|
90
|
+
# social media sharing:
|
91
|
+
'ul#sharebar',
|
92
|
+
'ul#sharebarx',
|
93
|
+
'.sharedaddy',
|
94
|
+
'#sharing_email',
|
95
|
+
|
96
|
+
# signup:
|
97
|
+
'#mailchimp_signup_bottom',
|
98
|
+
]
|
99
|
+
|
100
|
+
DEFAULT_SANITIZE_OPTIONS = {
|
101
|
+
:elements => %w[
|
102
|
+
a abbr acronym address area b big
|
103
|
+
blockquote br button caption center cite
|
104
|
+
code col colgroup dd del dfn dir
|
105
|
+
div dl dt em fieldset form h1
|
106
|
+
h2 h3 h4 h5 h6 hr i
|
107
|
+
img
|
108
|
+
input ins kbd label legend li map menu
|
109
|
+
ol optgroup option p pre q s samp
|
110
|
+
select small span strike strong sub
|
111
|
+
sup table tbody td textarea tfoot th
|
112
|
+
thead tr tt u ul var
|
57
113
|
],
|
58
114
|
:attributes => {
|
59
|
-
'a' => [
|
60
|
-
'img' => [
|
61
|
-
:all => [
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
115
|
+
'a' => %w[ href ],
|
116
|
+
'img' => %w[ src ],
|
117
|
+
:all => %w[
|
118
|
+
abbr accept accept-charset
|
119
|
+
accesskey action align alt axis
|
120
|
+
border cellpadding cellspacing char
|
121
|
+
charoff class charset checked cite
|
122
|
+
clear cols colspan color
|
123
|
+
compact coords datetime dir
|
124
|
+
disabled enctype for frame
|
125
|
+
headers height hreflang
|
126
|
+
hspace id ismap label lang
|
127
|
+
longdesc maxlength media method
|
128
|
+
multiple name nohref noshade
|
129
|
+
nowrap prompt readonly rel rev
|
130
|
+
rows rowspan rules scope
|
131
|
+
selected shape size span
|
132
|
+
start summary tabindex target
|
133
|
+
title type usemap valign value
|
134
|
+
vspace width
|
135
|
+
]
|
78
136
|
},
|
137
|
+
|
138
|
+
# medium permissive list:
|
139
|
+
#:elements => [
|
140
|
+
# 'a', 'b', 'blockquote', 'br', 'code', 'dd', 'del', 'dl', 'dt',
|
141
|
+
# 'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
|
142
|
+
# 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike', 'strong', 'sub',
|
143
|
+
# 'sup', 'table', 'tbody', 'td', 'th',
|
144
|
+
# 'thead', 'tr', 'u', 'ul',
|
145
|
+
#],
|
146
|
+
|
79
147
|
:protocols => {
|
80
148
|
'a' => {'href' => ['http', 'https', 'mailto', :relative]},
|
81
149
|
'img' => {'src' => ['http', 'https', :relative]}
|
82
150
|
},
|
83
151
|
|
84
|
-
#
|
85
|
-
# Gollum has a nice way to add this to your config optionally, see:
|
86
|
-
# https://github.com/github/gollum/blob/master/lib/gollum/sanitization.rb
|
152
|
+
# Consider including for deprecated/historical or spam-suspect pages:
|
87
153
|
#
|
88
154
|
# :add_attributes => {
|
89
155
|
# 'a' => {'rel' => 'nofollow'}
|
90
156
|
# }
|
157
|
+
#
|
158
|
+
# Gollum has a nice way to add this to your config optionally, see:
|
159
|
+
# https://github.com/github/gollum/blob/master/lib/gollum/sanitization.rb
|
91
160
|
}
|
92
|
-
|
161
|
+
|
162
|
+
DEFAULTS = {
|
163
|
+
:include => INCLUDE_CONTENT_ONLY,
|
164
|
+
:exclude => DEFAULT_EXCLUDE_OPTIONS,
|
165
|
+
:sanitize => DEFAULT_SANITIZE_OPTIONS,
|
166
|
+
:links => :unchanged,
|
167
|
+
}
|
168
|
+
|
169
|
+
def self.html( html, options={} )
|
170
|
+
new( html ).massage!( options ).to_html
|
93
171
|
end
|
94
172
|
|
95
|
-
def
|
96
|
-
|
97
|
-
|
173
|
+
def self.text( html, options={} )
|
174
|
+
new( html ).massage!( options ).to_text
|
175
|
+
end
|
176
|
+
|
177
|
+
def self.markdown( html, options={} )
|
178
|
+
ReverseMarkdown.parse( self.html( html, options ) )
|
179
|
+
end
|
180
|
+
|
181
|
+
def initialize( html )
|
182
|
+
@html = html.dup
|
183
|
+
end
|
184
|
+
|
185
|
+
def massage!( options={} )
|
186
|
+
self.class.translate_old_options( options )
|
187
|
+
options = DEFAULTS.merge( options )
|
188
|
+
absolutify_links!(options[:source_url]) if options.delete( :links ) == :absolute
|
189
|
+
absolutify_images!(options[:source_url]) if options.delete( :images ) == :absolute
|
190
|
+
include!( options.delete( :include ) )
|
191
|
+
exclude!( options.delete( :exclude ) )
|
192
|
+
sanitize!( options.delete( :sanitize ) )
|
193
|
+
tidy_whitespace!
|
194
|
+
raise "Unexpected options #{options.inspect}" unless options.empty?
|
195
|
+
self
|
196
|
+
end
|
197
|
+
|
198
|
+
def self.translate_old_options( options )
|
199
|
+
options[ :exclude ] = options.delete( :ignored_selectors ) if options[ :ignored_selectors ]
|
200
|
+
end
|
201
|
+
|
202
|
+
def exclude!( selectors_to_exclude )
|
203
|
+
doc = Nokogiri::HTML( @html )
|
204
|
+
selectors_to_exclude.to_a.each do |selector_to_exclude|
|
205
|
+
( doc / selector_to_exclude ).remove
|
206
|
+
end
|
207
|
+
@html = doc.to_s
|
208
|
+
end
|
209
|
+
|
210
|
+
def include!( selectors_to_include )
|
211
|
+
section = Nokogiri::HTML( @html )
|
212
|
+
selectors_to_include.to_a.each do |selector_to_include|
|
213
|
+
subsection = section / selector_to_include
|
214
|
+
section = subsection unless subsection.empty?
|
215
|
+
end
|
216
|
+
@html = section.inner_html
|
217
|
+
end
|
218
|
+
|
219
|
+
def sanitize!( sanitize_options={} )
|
220
|
+
# Sanitize does not thoroughly remove these tags -- so we do a manual pass:
|
221
|
+
%w[ script noscript style ].each do |tag|
|
222
|
+
unless sanitize_options[ :elements ] && sanitize_options[ :elements ].include?( tag )
|
223
|
+
@html.gsub!( %r{<#{tag}[^>]*>.*?</#{tag}>}mi, '' )
|
224
|
+
end
|
225
|
+
end
|
226
|
+
|
227
|
+
@html = Sanitize.clean( @html, sanitize_options )
|
228
|
+
@html
|
229
|
+
end
|
230
|
+
|
231
|
+
def absolutify_links!(source_url)
|
232
|
+
absolutify_paths!('a', 'href', source_url)
|
233
|
+
end
|
234
|
+
|
235
|
+
def absolutify_images!(source_url)
|
236
|
+
absolutify_paths!('img', 'src', source_url)
|
237
|
+
end
|
238
|
+
|
239
|
+
def absolutify_paths!(tag_name, attr, source_url)
|
240
|
+
raise "When asking for absolute images or paths, please pass in source_url" unless source_url
|
241
|
+
match = source_url.match( %r{(^[a-z]+?://[^/]+)(/.+/)?}i )
|
242
|
+
return @html unless match
|
98
243
|
base_url = match[ 1 ]
|
99
244
|
resource_dir_url = match[ 0 ] # whole regexp match
|
245
|
+
dom = Nokogiri::HTML.fragment( @html )
|
100
246
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
247
|
+
tags = dom / tag_name
|
248
|
+
tags.each do |tag|
|
249
|
+
value = tag[ attr ]
|
250
|
+
if value
|
251
|
+
tag[ attr ] =
|
252
|
+
case value
|
253
|
+
when %r{^//} # eg src="//upload.wikimedia.org/wikipedia/Map.png"
|
254
|
+
value
|
108
255
|
when %r{^/}
|
109
|
-
File.join( base_url,
|
256
|
+
File.join( base_url, value )
|
110
257
|
when %r{^\.\.}
|
111
|
-
File.join( resource_dir_url,
|
258
|
+
File.join( resource_dir_url, value )
|
112
259
|
else
|
113
|
-
|
260
|
+
value
|
114
261
|
end
|
115
262
|
end
|
116
263
|
end
|
117
|
-
|
118
|
-
html
|
264
|
+
|
265
|
+
@html = dom.to_s.strip
|
119
266
|
end
|
120
267
|
|
121
|
-
def
|
122
|
-
@
|
268
|
+
def tidy_whitespace!
|
269
|
+
@html = strip_lines(@html)
|
270
|
+
tidy_tables!
|
271
|
+
end
|
272
|
+
|
273
|
+
def tidy_tables!
|
274
|
+
@html.gsub!(%r{(<table\b)(.+?)(</table>)}m) { open,body,close=$1,$2,$3; open + body.gsub(/\n{2,}/, "\n") + close }
|
123
275
|
end
|
124
276
|
|
125
277
|
def to_text
|
126
|
-
text = CGI.unescapeHTML( @
|
278
|
+
text = CGI.unescapeHTML( @html )
|
127
279
|
|
128
280
|
# normalize newlines
|
129
281
|
text.gsub!(/\r\n/, "\n")
|
@@ -132,7 +284,7 @@ module HtmlMassager
|
|
132
284
|
# nbsp => ' '
|
133
285
|
text.gsub!(/&nbsp;/, ' ')
|
134
286
|
|
135
|
-
# TODO: figure out how to do these in ruby 1.9
|
287
|
+
# TODO: figure out how to do these in ruby 1.9:
|
136
288
|
# They now throw 'incompatible encoding -- ascii regexp for utf8 string'
|
137
289
|
# text.gsub!( /\302\240/, ' ' ) # UTF8 for nbsp
|
138
290
|
# text.gsub!( /\240/, ' ' ) # ascii for nbsp
|
@@ -163,14 +315,21 @@ module HtmlMassager
|
|
163
315
|
"#{text}\n"
|
164
316
|
end
|
165
317
|
|
166
|
-
def strip_lines(
|
167
|
-
lines =
|
318
|
+
def strip_lines(content)
|
319
|
+
lines = content.split( $/ ) # $/ is the current ruby line ending, \n by default
|
168
320
|
lines.map!{ |line| line.strip }
|
169
|
-
|
170
|
-
|
321
|
+
processed = lines.join( $/ )
|
322
|
+
processed.strip
|
323
|
+
end
|
324
|
+
|
325
|
+
|
326
|
+
def to_html
|
327
|
+
@html.strip!
|
328
|
+
@html
|
171
329
|
end
|
172
330
|
|
173
331
|
end
|
332
|
+
|
174
333
|
end
|
175
334
|
|
176
|
-
include HtmlMassager
|
335
|
+
include HtmlMassager
|
data/spec/html_massage_spec.rb
ADDED
@@ -0,0 +1,210 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'html_massage'))
|
4
|
+
|
5
|
+
describe HtmlMassager::HtmlMassage do
|
6
|
+
|
7
|
+
include HtmlMassager
|
8
|
+
|
9
|
+
describe ".html" do
|
10
|
+
it 'Should massage and output HTML' do
|
11
|
+
html = "<html><body><div>This is some great content!</div></body></html>"
|
12
|
+
HtmlMassage.html(html).should == "<div>This is some great content!</div>"
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should remove HTML "doctype"' do
|
16
|
+
html = '
|
17
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
18
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
19
|
+
<body>
|
20
|
+
<p>foobar</p>
|
21
|
+
</body>
|
22
|
+
</html>
|
23
|
+
'
|
24
|
+
HtmlMassage.html(html).strip.should == "<p>foobar</p>"
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
29
|
+
describe ".text" do
|
30
|
+
it 'Should massage and output text' do
|
31
|
+
html = "<html><body><div>This is some great content!</div></body></html>"
|
32
|
+
HtmlMassage.text(html).strip.should == "This is some great content!"
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should convert an HTML sample as expected' do
|
36
|
+
html = "
|
37
|
+
<html><body>
|
38
|
+
<h1>Title</h1>
|
39
|
+
This is the body.
|
40
|
+
Testing <a href='http://www.google.com/'>link to Google</a>.
|
41
|
+
<p />
|
42
|
+
Testing image <img src='/noimage.png'>.
|
43
|
+
<br />
|
44
|
+
The End.
|
45
|
+
</body></html>
|
46
|
+
"
|
47
|
+
HtmlMassage.text(html).strip.should == "Title
|
48
|
+
|
49
|
+
This is the body. Testing link to Google.
|
50
|
+
|
51
|
+
Testing image .
|
52
|
+
The End.
|
53
|
+
".strip.gsub(/^ +/, '')
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'should play nice with UTF8 HTML source' do
|
57
|
+
html = '
|
58
|
+
<html>
|
59
|
+
<head>
|
60
|
+
<meta content="text/html; charset=utf-8" http-equiv="content-type" />
|
61
|
+
</head>
|
62
|
+
<body>
|
63
|
+
Niq is a performer → Angry, arrogant, & so admired.
|
64
|
+
</body>
|
65
|
+
</html>
|
66
|
+
'
|
67
|
+
HtmlMassage.text(html).strip.should == "Niq is a performer → Angry, arrogant, & so admired."
|
68
|
+
end
|
69
|
+
|
70
|
+
it 'should play nice with &nbsp;' do
|
71
|
+
pending
|
72
|
+
html = '&nbsp;'
|
73
|
+
HtmlMassage.text(html).strip.should == " "
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
describe ".markdown" do
|
78
|
+
it 'Should massage and output markdown' do
|
79
|
+
html = "<html><body><div>This is some <i>great</i> content!</div></body></html>"
|
80
|
+
massaged = HtmlMassage.markdown html
|
81
|
+
massaged.strip.should == "This is some _great_ content!"
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
describe "#massage!" do
|
86
|
+
|
87
|
+
context 'invalid html' do
|
88
|
+
[
|
89
|
+
"<html><body>foobar</body>",
|
90
|
+
"<html><body>foobar</html>",
|
91
|
+
"<body>foobar</body></html>",
|
92
|
+
"<html>foobar</body></html>",
|
93
|
+
].each do |broken_html|
|
94
|
+
it "should return 'foobar' when given #{broken_html.inspect}" do
|
95
|
+
HtmlMassage.new(broken_html).massage!.to_text.strip.should == "foobar"
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
pending 'should convert an HTML sample as expected'
|
101
|
+
|
102
|
+
it 'should leave HTML entities intact' do
|
103
|
+
pending 'improve ::Node.massage_html -- handling of html entities, utf8 chars'
|
104
|
+
original = "This “branching” of creative works"
|
105
|
+
massage = HtmlMassager::HtmlMassage.new( original )
|
106
|
+
massage.massage!.should == original
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
describe ".sanitize_html" do
|
111
|
+
it 'should remove <style> tags and their contents' do
|
112
|
+
html = %~<!-- Remix button --><br />
|
113
|
+
<style type='text/css'>
|
114
|
+
a.remix_on_wikinodes_tab {
|
115
|
+
top: 25%; left: 0; width: 42px; height: 100px; color: #FFF; cursor:pointer; text-indent:-99999px; overflow:hidden; position: fixed; z-index: 99999; margin-left: -7px; background-image: url(http://www.openyourproject.org/images/remix_tab.png); _position: absolute; right: 0 !important; left: auto !important; margin-right: -7px !important; margin-left: auto !important; } a.remix_on_wikinodes_tab:hover { margin-left: -4px; margin-right: -4px !important; margin-left: auto !important;
|
116
|
+
}
|
117
|
+
</style>
|
118
|
+
<p> <script type="text/javascript" language="javascript"> document.write( '<a style="background-color: #2a2a2a;" class="remix_on_wikinodes_tab" href="http://www.openyourproject.org/nodes/new?parent=' + window.location + '" title="Remix this content on WikiNodes -- creative collaboration designed to set you free" >Remix This</a>' ); </script> <noscript>Note: you can turn on Javascript to see the ‘Remix This’ link.</noscript></p>
|
119
|
+
~
|
120
|
+
html_massager = HtmlMassage.new( html )
|
121
|
+
html_massager.sanitize!.should_not =~ /remix_on_wikinodes_tab/
|
122
|
+
end
|
123
|
+
|
124
|
+
it 'should remove <noscript> tags and their contents' do
|
125
|
+
html = %{ <noscript>Note: you can turn on Javascript to see the 'Remix This' link. </noscript> }
|
126
|
+
html_massager = HtmlMassage.new( html )
|
127
|
+
html_massager.sanitize!.strip.should == ''
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
describe '#absolutify_links' do
|
132
|
+
it 'should work for absolute path links' do
|
133
|
+
source_url = 'http://en.wikipedia.org/wiki/Singularity'
|
134
|
+
original_html = '<a href="/wiki/Ray_Kurzweil">Ray</a>'
|
135
|
+
html_massager = HtmlMassage.new( original_html )
|
136
|
+
html_massager.absolutify_links!(source_url).should ==
|
137
|
+
'<a href="http://en.wikipedia.org/wiki/Ray_Kurzweil">Ray</a>'
|
138
|
+
end
|
139
|
+
|
140
|
+
it 'should work for absolute path links (bugfix)' do
|
141
|
+
source_url = 'http://p2pfoundation.net/NextNet'
|
142
|
+
original_html = '<a href="/Ten_Principles_for_an_Autonomous_Internet" title="Ten Principles for an Autonomous Internet">Ten Principles for an Autonomous Internet</a>'
|
143
|
+
html_massager = HtmlMassage.new( original_html )
|
144
|
+
html_massager.absolutify_links!(source_url).should ==
|
145
|
+
'<a href="http://p2pfoundation.net/Ten_Principles_for_an_Autonomous_Internet" title="Ten Principles for an Autonomous Internet">Ten Principles for an Autonomous Internet</a>'
|
146
|
+
end
|
147
|
+
|
148
|
+
it 'should work for relative links' do
|
149
|
+
source_url = 'http://en.wikipedia.org/wiki/Singularity'
|
150
|
+
original_html = '<a href="../wiki/Ray_Kurzweil">Ray</a>'
|
151
|
+
html_massager = HtmlMassage.new( original_html )
|
152
|
+
html_massager.absolutify_links!(source_url).should ==
|
153
|
+
'<a href="http://en.wikipedia.org/wiki/../wiki/Ray_Kurzweil">Ray</a>'
|
154
|
+
end
|
155
|
+
|
156
|
+
it 'should leave full URLs alone' do
|
157
|
+
source_url = 'http://en.wikipedia.org/wiki/Singularity'
|
158
|
+
original_html = '<a href="http://www.wired.com/wiredscience">wired science</a>'
|
159
|
+
html_massager = HtmlMassage.new( original_html )
|
160
|
+
html_massager.absolutify_links!(source_url).should == original_html
|
161
|
+
end
|
162
|
+
|
163
|
+
it 'should leave // style URLs alone' do
|
164
|
+
source_url = 'http://en.wikipedia.org/wiki/Singularity'
|
165
|
+
original_html = '<a href="//wired.com/wiredscience">wired science</a>'
|
166
|
+
html_massager = HtmlMassage.new( original_html )
|
167
|
+
html_massager.absolutify_links!(source_url).should == original_html
|
168
|
+
end
|
169
|
+
|
170
|
+
it 'should leave "jump links" alone' do
|
171
|
+
source_url = 'http://en.wikipedia.org/wiki/Singularity'
|
172
|
+
original_html = '<a href="#cite_1">1</a>'
|
173
|
+
html_massager = HtmlMassage.new( original_html )
|
174
|
+
html_massager.absolutify_links!(source_url).should == original_html
|
175
|
+
end
|
176
|
+
end
|
177
|
+
|
178
|
+
describe '#absolutify_images!' do
|
179
|
+
it 'should work for absolute path links' do
|
180
|
+
source_url = 'http://enlightenedstructure.org/Home/'
|
181
|
+
original_html = '<img src="/IMG/we-are.png" alt="" class="icon">'
|
182
|
+
html_massager = HtmlMassage.new( original_html )
|
183
|
+
html_massager.absolutify_images!(source_url).should ==
|
184
|
+
'<img src="http://enlightenedstructure.org/IMG/we-are.png" alt="" class="icon">'
|
185
|
+
end
|
186
|
+
|
187
|
+
it 'should work for absolute path links (bugfix)' do
|
188
|
+
source_url = 'http://www.realitysandwich.com/blog/daniel_pinchbeck'
|
189
|
+
original_html = '<img src="/sites/realitysandwich.com/themes/zen/pinkreality/images/creative-commons-license.png" alt="Attribution-Noncommercial-Share Alike 3.0 Unported" title="" width="88" height="31">'
|
190
|
+
html_massager = HtmlMassage.new( original_html )
|
191
|
+
html_massager.absolutify_images!(source_url).should ==
|
192
|
+
'<img src="http://www.realitysandwich.com/sites/realitysandwich.com/themes/zen/pinkreality/images/creative-commons-license.png" alt="Attribution-Noncommercial-Share Alike 3.0 Unported" title="" width="88" height="31">'
|
193
|
+
end
|
194
|
+
|
195
|
+
it 'should leave // style URLs alone' do
|
196
|
+
source_url = 'http://en.wikipedia.org/wiki/List_of_communes_in_France_with_over_20,000_inhabitants_(2006_census)'
|
197
|
+
original_html = '<img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/f9/France-CIA_WFB_Map.png/220px-France-CIA_WFB_Map.png" width="220" height="235" class="thumbimage">'
|
198
|
+
html_massager = HtmlMassage.new( original_html )
|
199
|
+
html_massager.absolutify_images!(source_url).should == original_html
|
200
|
+
end
|
201
|
+
end
|
202
|
+
|
203
|
+
describe '#tidy_tables!' do
|
204
|
+
it 'should remove multiple newlines from tables' do
|
205
|
+
HtmlMassage.new("<table><tr>\n<th>Chư\n\n\nYang Sin National Park</th>\n\n\n</tr></table>").tidy_tables!.should ==
|
206
|
+
"<table><tr>\n<th>Chư\nYang Sin National Park</th>\n</tr></table>"
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
end
|
metadata
CHANGED
@@ -1,82 +1,140 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: html_massage
|
3
|
-
version: !ruby/object:Gem::Version
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
4
5
|
prerelease:
|
5
|
-
version: 0.0.2
|
6
6
|
platform: ruby
|
7
|
-
authors:
|
8
|
-
- Harlan
|
7
|
+
authors:
|
8
|
+
- Harlan T Wood
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2012-11-25 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
16
15
|
name: nokogiri
|
17
|
-
|
18
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
19
17
|
none: false
|
20
|
-
requirements:
|
21
|
-
- -
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 1.4
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.4'
|
24
22
|
type: :runtime
|
25
|
-
|
26
|
-
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.4'
|
30
|
+
- !ruby/object:Gem::Dependency
|
27
31
|
name: sanitize
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '2.0'
|
38
|
+
type: :runtime
|
28
39
|
prerelease: false
|
29
|
-
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '2.0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: thor
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
30
49
|
none: false
|
31
|
-
requirements:
|
32
|
-
- -
|
33
|
-
- !ruby/object:Gem::Version
|
34
|
-
version:
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
35
54
|
type: :runtime
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: rest-client
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '1.6'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '1.6'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: rspec
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '2.5'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '2.5'
|
94
|
+
description: ! 'Massages HTML how you want to: sanitize tags, remove headers and footers,
|
95
|
+
convert to plain text.'
|
96
|
+
email:
|
97
|
+
- code@harlantwood.net
|
98
|
+
executables:
|
99
|
+
- html_massage
|
42
100
|
extensions: []
|
43
|
-
|
44
101
|
extra_rdoc_files: []
|
45
|
-
|
46
|
-
files:
|
102
|
+
files:
|
47
103
|
- .gitignore
|
48
104
|
- Gemfile
|
105
|
+
- License-MIT
|
49
106
|
- README.md
|
50
107
|
- Rakefile
|
108
|
+
- bin/html_massage
|
109
|
+
- generate_readme.rb
|
51
110
|
- html_massage.gemspec
|
52
111
|
- lib/html_massage.rb
|
112
|
+
- lib/html_massage/cli.rb
|
53
113
|
- lib/html_massage/version.rb
|
54
|
-
|
114
|
+
- spec/html_massage_spec.rb
|
115
|
+
homepage: https://github.com/harlantwood/html_massage
|
55
116
|
licenses: []
|
56
|
-
|
57
117
|
post_install_message:
|
58
118
|
rdoc_options: []
|
59
|
-
|
60
|
-
require_paths:
|
119
|
+
require_paths:
|
61
120
|
- lib
|
62
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
121
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
63
122
|
none: false
|
64
|
-
requirements:
|
65
|
-
- -
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version:
|
68
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
123
|
+
requirements:
|
124
|
+
- - ! '>='
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
version: '0'
|
127
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
69
128
|
none: false
|
70
|
-
requirements:
|
71
|
-
- -
|
72
|
-
- !ruby/object:Gem::Version
|
73
|
-
version:
|
129
|
+
requirements:
|
130
|
+
- - ! '>='
|
131
|
+
- !ruby/object:Gem::Version
|
132
|
+
version: '0'
|
74
133
|
requirements: []
|
75
|
-
|
76
|
-
|
77
|
-
rubygems_version: 1.8.5
|
134
|
+
rubyforge_project:
|
135
|
+
rubygems_version: 1.8.24
|
78
136
|
signing_key:
|
79
137
|
specification_version: 3
|
80
138
|
summary: Massages HTML how you want to.
|
81
|
-
test_files:
|
82
|
-
|
139
|
+
test_files:
|
140
|
+
- spec/html_massage_spec.rb
|