html_massage 0.0.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -2,3 +2,4 @@
2
2
  .idea
3
3
  Gemfile.lock
4
4
  pkg/*
5
+ README-backup*
data/Gemfile CHANGED
@@ -1,4 +1,6 @@
1
- source "http://rubygems.org"
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'reverse_markdown', :git => 'git://github.com/harlantwood/reverse_markdown.git'
2
4
 
3
5
  # Specify your gem's dependencies in html_massage.gemspec
4
6
  gemspec
data/License-MIT ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Harlan T Wood
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md CHANGED
@@ -1,18 +1,80 @@
1
1
  # html_massage
2
2
 
3
3
  Give your HTML a massage, in just the ways it loves:
4
+
4
5
  * Remove headers and footers and navigation, and strip to only the "content" part of the HTML
5
6
  * Sanitize tags, removing javascript and styling
6
7
  * Convert your HTML to nicely-formatted plain text
7
8
 
8
- ## Usage
9
+ ## Sample Usage
10
+
11
+ ### Full Massage
9
12
 
10
- require 'rubygems'
11
13
  require 'html_massage'
12
- html = "<html><body><div id='header'>My Site</div><div>This is some great content!</div></body></html>"
13
- html_massage = HtmlMassage.new( html, :ignored_selectors => [ '#header' ] )
14
- # => #<HtmlMassager::HtmlMassage ... >
15
- html_massage.to_html
16
- # => "<div>This is some great content!</div>"
17
- html_massage.to_text
18
- # => "This is some great content!\n"
14
+
15
+ html = %{
16
+ <html>
17
+ <head>
18
+ <script type="text/javascript">document.write('I am a bad script');</script>
19
+ </head>
20
+ <body>
21
+ <div id="header">My Site</div>
22
+ <div>This is some great content!</div>
23
+ <a href ="foo/bar.html">Click this link</a>
24
+ </body>
25
+ </html>
26
+ }
27
+
28
+ puts HtmlMassage.html( html )
29
+ # => "<div>This is some great content!</div>"
30
+
31
+ puts HtmlMassage.text( html )
32
+ # => "This is some great content!\n"
33
+
34
+ ### Content Only
35
+
36
+ html_massage = HtmlMassage.new( html,
37
+ :exclude => [ '#header' ] )
38
+ # => #<HtmlMassager::HtmlMassage ... >
39
+
40
+ puts html_massage.exclude!
41
+ # <div>This is some great content!</div>
42
+ # <a href="foo/bar.html">Click this link</a>
43
+
44
+ ### Sanitize HTML
45
+
46
+ html_massage = HtmlMassage.new( html,
47
+ :exclude => [ '#header' ] )
48
+ # => #<HtmlMassager::HtmlMassage ... >
49
+
50
+ puts html_massage.sanitize_html!
51
+ # <html>
52
+ # <head>
53
+ # </head>
54
+ # <body>
55
+ # <div id="header">My Site</div>
56
+ # <div>This is some great content!</div>
57
+ # </body>
58
+ # </html>
59
+
60
+ ### Make Links Absolute
61
+
62
+ html_massage = HtmlMassage.new( html,
63
+ :exclude => [ '#header' ],
64
+ :source_url => 'http://example.com/joe/page1.html' )
65
+
66
+ puts html_massage.absolutify_links!
67
+ # <html>
68
+ # <head>
69
+ # <script type="text/javascript">document.write('I am a bad script');</script>
70
+ # </head>
71
+ # <body>
72
+ # <div id="header">My Site</div>
73
+ # <div>This is some great content!</div>
74
+ # <a href ="http://example.com/joe/foo/bar.html">Click this link</a>
75
+ # </body>
76
+ # </html>
77
+
78
+ puts html_massage.absolutify_images!
79
+ #
80
+
data/Rakefile CHANGED
@@ -1 +1 @@
1
- require 'bundler/gem_tasks'
1
+ require "bundler/gem_tasks"
data/bin/html_massage ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'html_massage/cli'
4
+ HtmlMassager::CLI.start
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env ruby
2
+
3
# Defines IO.write( path, content ) for this script.
# NOTE(review): Ruby 1.9.3+ already ships an IO.write; this definition
# replaces it for the lifetime of the script -- confirm it is still wanted.
class IO
  # Writes +content+ to the file at +path+, creating the file or truncating
  # an existing one.
  #
  # path    - String path of the file to write.
  # content - the data to write.
  #
  # Returns nil.
  def self.write( path, content )
    # The block form closes the handle even when the write raises, so a
    # failed write no longer leaks an open file.
    File.open( path, "w" ) { |file| file.write( content ) }
    nil
  end
end
10
+
11
+ CHUNK_SEP = "\n\n"
12
+
13
+ def is_code?( markdown )
14
+ markdown.start_with?( ' ' )
15
+ end
16
+
17
+ def header( text, top_newlines )
18
+ puts "\n" * top_newlines
19
+ puts '*'*10
20
+ puts text
21
+ puts '*'*10
22
+ end
23
+
24
+ system( "cp README.md README-backup-#{Time.now.to_s.gsub(/\W/, '-')}.md" )
25
+ readme = IO.read( 'README.md' )
26
+ chunks = readme.split( CHUNK_SEP )
27
+ code = ''
28
+ new_readme = ''
29
+ chunks.each do |chunk|
30
+ if is_code?( chunk )
31
+
32
+ chunk
33
+ code << chunk << CHUNK_SEP
34
+
35
+ header( 'Code', 3 )
36
+ puts code
37
+ header( 'Result', 1 )
38
+ puts result = eval( code )
39
+
40
+ unless result.nil?
41
+ p 111, chunk
42
+ result = result.to_s
43
+ _, code_sans_results = chunk.match( /\A((?: [^#].*\r?\n)+)(?: #.*\r?\n)+\Z/ ).to_a
44
+ if code_sans_results
45
+ p 222
46
+ result = result.split("\n").map{ |line| " # #{line}" }.join("\n")
47
+ chunk = code_sans_results << result << CHUNK_SEP
48
+ end
49
+ end
50
+
51
+ header( 'Output', 1 )
52
+ puts chunk
53
+
54
+ new_readme << chunk << CHUNK_SEP
55
+ end
56
+
57
+ end
58
+
59
+ IO.write( 'README.md', new_readme )
data/html_massage.gemspec CHANGED
@@ -1,23 +1,28 @@
1
1
  # -*- encoding: utf-8 -*-
2
- $:.push File.expand_path("../lib", __FILE__)
3
- require "html_massage/version"
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'html_massage/version'
4
5
 
5
- Gem::Specification.new do |s|
6
- s.name = "html_massage"
7
- s.version = HtmlMassager::VERSION
8
- s.authors = ["Harlan Knight Wood"]
9
- s.email = ["code@hkw7.org"]
10
- s.homepage = "https://github.com/onesunone/html_massage"
11
- s.summary = %{Massages HTML how you want to.}
12
- s.description = %{Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text.}
6
+ Gem::Specification.new do |gem|
7
+ gem.name = "html_massage"
8
+ gem.version = HtmlMassager::VERSION
9
+ gem.authors = ["Harlan T Wood"]
10
+ gem.email = ["code@harlantwood.net"]
11
+ gem.homepage = "https://github.com/harlantwood/html_massage"
12
+ gem.summary = %{Massages HTML how you want to.}
13
+ gem.description = %{Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text.}
13
14
 
14
- s.rubyforge_project = "html_massage"
15
+ gem.files = `git ls-files`.split($/)
16
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
+ gem.require_paths = ["lib"]
15
19
 
16
- s.add_dependency('nokogiri', ">= 1.4.4")
17
- s.add_dependency('sanitize', ">= 2.0.0")
20
+ gem.add_dependency "nokogiri", ">= 1.4"
21
+ gem.add_dependency "sanitize", ">= 2.0"
22
+ gem.add_dependency "thor"
23
+ gem.add_dependency "rest-client", ">= 1.6"
24
+
25
+ gem.add_development_dependency "rspec", ">= 2.5"
18
26
 
19
- s.files = `git ls-files`.split("\n")
20
- s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
21
- s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
22
- s.require_paths = ["lib"]
23
27
  end
28
+
@@ -0,0 +1,35 @@
1
+ require 'thor'
2
+ require 'rest_client'
3
+ require 'html_massage'
4
+
5
+ module HtmlMassager
6
+
7
+ class CLI < Thor
8
+
9
+ desc :html, 'Download HTML from given URL and massage into html'
10
+ def html url
11
+ STDOUT.puts massage_to :html, url
12
+ end
13
+
14
+ desc :text, 'Download HTML from given URL and massage into plain text'
15
+ def text url
16
+ STDOUT.puts massage_to :text, url
17
+ end
18
+
19
+ desc :markdown, 'Download HTML from given URL and massage into markdown'
20
+ def markdown url
21
+ STDOUT.puts massage_to :markdown, url
22
+ end
23
+
24
+ no_tasks do
25
+ def massage_to output_format, url
26
+ HtmlMassage.send output_format,
27
+ RestClient.get(url),
28
+ :source_url => url,
29
+ :links => :absolute,
30
+ :images => :absolute
31
+ end
32
+ end
33
+
34
+ end
35
+ end
@@ -1,3 +1,3 @@
1
1
  module HtmlMassager
2
- VERSION = "0.0.2"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/html_massage.rb CHANGED
@@ -1,129 +1,281 @@
1
1
  require "cgi"
2
2
  require "nokogiri"
3
3
  require "sanitize"
4
+ require "reverse_markdown"
4
5
  require "html_massage/version"
5
6
 
6
7
  module HtmlMassager
8
+
7
9
  class HtmlMassage
8
- def initialize( html, options )
9
- @source_url = options[ :source_url ]
10
- @ignored_selectors = options[ :ignored_selectors ]
11
- @clean_html = massage_html( html )
12
- end
13
10
 
14
- def massage_html( html )
15
- html = content_only( html )
16
- html = sanitize_html( html )
17
- html = absolutify_links( html ) if @source_url
11
+ INCLUDE_CONTENT_ONLY = %w[
18
12
  html
19
- end
13
+ body
14
+ ]
20
15
 
21
- def content_only( content )
22
- doc = Nokogiri::HTML( content )
23
- body = doc / 'html' / 'body'
16
+ DEFAULT_EXCLUDE_OPTIONS = [
17
+ # general:
18
+ 'head',
19
+ 'title',
20
+ 'meta',
24
21
 
25
- @ignored_selectors.to_a.each do |ignored_selector|
26
- ( body / ignored_selector ).remove
27
- end
22
+ 'div#header',
23
+ 'div.header',
24
+ 'div#banner',
25
+ 'div.banner',
26
+ '.footer',
27
+ '#footer',
28
+ 'div#navigation',
29
+ 'div.navigation',
30
+ 'div#nav',
31
+ 'div.nav',
32
+ 'div#sidebar',
33
+ 'div.sidebar',
34
+ '#breadcrumbs',
35
+ '.breadcrumbs',
36
+ '#backfornav',
37
+ '.backfornav',
38
+ 'div.post-footer',
39
+ 'div.navigation',
28
40
 
29
- content = body / '#content'
30
- content = body if content.empty?
31
- content = content.inner_html
32
- content
33
- end
41
+ # wordpress:
42
+ 'a#left_arrow',
43
+ 'a#right_arrow',
44
+ 'div#comments',
45
+ 'div#comment-section',
46
+ 'div#respond',
34
47
 
35
- def sanitize_html(html)
36
- html = html.dup
48
+ # typepad
49
+ '#pagebody > #pagebody-inner > #alpha',
50
+ 'p.content-nav',
51
+
52
+ # blog widgets
53
+ '.widget_blog_subscription',
54
+ '.loggedout-follow-normal',
37
55
 
38
- %w[ script noscript style ].each do |tag|
39
- html.gsub!( %r{<#{tag}[^>]*>.*?</#{tag}>}mi, '' )
40
- end
41
56
 
42
- Sanitize.clean(
43
- html,
44
- {
45
- :elements => [
46
- 'a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
47
- 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
48
- 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir',
49
- 'div', 'dl', 'dt', 'em', 'fieldset', 'form', 'h1',
50
- 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
51
- 'img',
52
- 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu',
53
- 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
54
- 'select', 'small', 'span', 'strike', 'strong', 'sub',
55
- 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
56
- 'thead', 'tr', 'tt', 'u', 'ul', 'var',
57
+ # wikipedia
58
+
59
+ '#bodyContent > #siteSub',
60
+ '#bodyContent > #contentSub',
61
+ '#bodyContent > #jump-to-nav',
62
+ 'table.metadata',
63
+ 'table.navbox',
64
+ 'table.toc',
65
+ 'div#catlinks',
66
+ 'div.printfooter',
67
+ 'h1 > span.editsection',
68
+ 'h2 > span.editsection',
69
+ 'h3 > span.editsection',
70
+ 'h4 > span.editsection',
71
+ 'h5 > span.editsection',
72
+ 'h6 > span.editsection',
73
+
74
+ # wikipedia "message boxes" -- metadata such as "requires cleanup":
75
+ # see http://en.wikipedia.org/wiki/Template:Ambox
76
+ 'table.ambox',
77
+ 'table.tmbox',
78
+ 'table.imbox',
79
+ 'table.cmbox',
80
+ 'table.ombox',
81
+ 'table.fmbox',
82
+ 'table.dmbox',
83
+
84
+ # mediawiki
85
+ '#mw-subcategories',
86
+ '#mw-pages',
87
+ '#mw-head',
88
+ '#mw-panel',
89
+
90
+ # social media sharing:
91
+ 'ul#sharebar',
92
+ 'ul#sharebarx',
93
+ '.sharedaddy',
94
+ '#sharing_email',
95
+
96
+ # signup:
97
+ '#mailchimp_signup_bottom',
98
+ ]
99
+
100
+ DEFAULT_SANITIZE_OPTIONS = {
101
+ :elements => %w[
102
+ a abbr acronym address area b big
103
+ blockquote br button caption center cite
104
+ code col colgroup dd del dfn dir
105
+ div dl dt em fieldset form h1
106
+ h2 h3 h4 h5 h6 hr i
107
+ img
108
+ input ins kbd label legend li map menu
109
+ ol optgroup option p pre q s samp
110
+ select small span strike strong sub
111
+ sup table tbody td textarea tfoot th
112
+ thead tr tt u ul var
57
113
  ],
58
114
  :attributes => {
59
- 'a' => ['href'],
60
- 'img' => ['src'],
61
- :all => ['abbr', 'accept', 'accept-charset',
62
- 'accesskey', 'action', 'align', 'alt', 'axis',
63
- 'border', 'cellpadding', 'cellspacing', 'char',
64
- 'charoff', 'class', 'charset', 'checked', 'cite',
65
- 'clear', 'cols', 'colspan', 'color',
66
- 'compact', 'coords', 'datetime', 'dir',
67
- 'disabled', 'enctype', 'for', 'frame',
68
- 'headers', 'height', 'hreflang',
69
- 'hspace', 'id', 'ismap', 'label', 'lang',
70
- 'longdesc', 'maxlength', 'media', 'method',
71
- 'multiple', 'name', 'nohref', 'noshade',
72
- 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
73
- 'rows', 'rowspan', 'rules', 'scope',
74
- 'selected', 'shape', 'size', 'span',
75
- 'start', 'summary', 'tabindex', 'target',
76
- 'title', 'type', 'usemap', 'valign', 'value',
77
- 'vspace', 'width']
115
+ 'a' => %w[ href ],
116
+ 'img' => %w[ src ],
117
+ :all => %w[
118
+ abbr accept accept-charset
119
+ accesskey action align alt axis
120
+ border cellpadding cellspacing char
121
+ charoff class charset checked cite
122
+ clear cols colspan color
123
+ compact coords datetime dir
124
+ disabled enctype for frame
125
+ headers height hreflang
126
+ hspace id ismap label lang
127
+ longdesc maxlength media method
128
+ multiple name nohref noshade
129
+ nowrap prompt readonly rel rev
130
+ rows rowspan rules scope
131
+ selected shape size span
132
+ start summary tabindex target
133
+ title type usemap valign value
134
+ vspace width
135
+ ]
78
136
  },
137
+
138
+ # medium permissive list:
139
+ #:elements => [
140
+ # 'a', 'b', 'blockquote', 'br', 'code', 'dd', 'del', 'dl', 'dt',
141
+ # 'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
142
+ # 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike', 'strong', 'sub',
143
+ # 'sup', 'table', 'tbody', 'td', 'th',
144
+ # 'thead', 'tr', 'u', 'ul',
145
+ #],
146
+
79
147
  :protocols => {
80
148
  'a' => {'href' => ['http', 'https', 'mailto', :relative]},
81
149
  'img' => {'src' => ['http', 'https', :relative]}
82
150
  },
83
151
 
84
- # consider including for deprecated/historical/or spam-suspect pages:
85
- # Gollum has a nice way to add this to your config optionally, see:
86
- # https://github.com/github/gollum/blob/master/lib/gollum/sanitization.rb
152
+ # Consider including for deprecated/historical or spam-suspect pages:
87
153
  #
88
154
  # :add_attributes => {
89
155
  # 'a' => {'rel' => 'nofollow'}
90
156
  # }
157
+ #
158
+ # Gollum has a nice way to add this to your config optionally, see:
159
+ # https://github.com/github/gollum/blob/master/lib/gollum/sanitization.rb
91
160
  }
92
- )
161
+
162
+ DEFAULTS = {
163
+ :include => INCLUDE_CONTENT_ONLY,
164
+ :exclude => DEFAULT_EXCLUDE_OPTIONS,
165
+ :sanitize => DEFAULT_SANITIZE_OPTIONS,
166
+ :links => :unchanged,
167
+ }
168
+
169
+ def self.html( html, options={} )
170
+ new( html ).massage!( options ).to_html
93
171
  end
94
172
 
95
- def absolutify_links( html )
96
- match = @source_url.match( %r{(^[a-z]+://[^/]+)(/.+/)}i )
97
- return html unless match
173
+ def self.text( html, options={} )
174
+ new( html ).massage!( options ).to_text
175
+ end
176
+
177
+ def self.markdown( html, options={} )
178
+ ReverseMarkdown.parse( self.html( html, options ) )
179
+ end
180
+
181
+ def initialize( html )
182
+ @html = html.dup
183
+ end
184
+
185
# Runs the whole massage pipeline on the wrapped HTML, in place.
#
# options - Hash of settings, merged over DEFAULTS:
#           :source_url - URL the HTML came from; handed to
#                         absolutify_links! / absolutify_images!.
#           :links      - :absolute makes <a href> values absolute.
#           :images     - :absolute makes <img src> values absolute.
#           :include    - selectors passed to include!.
#           :exclude    - selectors passed to exclude!.
#           :sanitize   - options passed to sanitize!.
#           :ignored_selectors - legacy name for :exclude.
#
# Raises RuntimeError when any other key is supplied.
# Returns self, so calls can be chained (massage!.to_html).
def massage!( options={} )
  # Work on a copy: translate_old_options edits the hash it is given, and
  # the caller's hash must come back untouched.
  options = options.dup
  self.class.translate_old_options( options )
  options = DEFAULTS.merge( options )

  # Consume :source_url here. If it stayed in the hash, the
  # "Unexpected options" check below would reject every call that passes it.
  source_url = options.delete( :source_url )
  absolutify_links!( source_url ) if options.delete( :links ) == :absolute
  absolutify_images!( source_url ) if options.delete( :images ) == :absolute

  include!( options.delete( :include ) )
  exclude!( options.delete( :exclude ) )
  sanitize!( options.delete( :sanitize ) )
  tidy_whitespace!

  # Every recognised key has been deleted by now; leftovers are typos or
  # unsupported options.
  raise "Unexpected options #{options.inspect}" unless options.empty?
  self
end
197
+
198
# Rewrites legacy option names to their current equivalents, in place.
#
# options - the options Hash to update. The old :ignored_selectors key is
#           always removed; a non-nil value is stored under :exclude.
#
# The legacy key is removed even when its value is nil or false, so it
# cannot survive into massage! and trip the "Unexpected options" check.
#
# Returns the translated selectors, or nil when nothing was translated.
def self.translate_old_options( options )
  return unless options.key?( :ignored_selectors )
  legacy_selectors = options.delete( :ignored_selectors )
  options[ :exclude ] = legacy_selectors if legacy_selectors
end
201
+
202
+ def exclude!( selectors_to_exclude )
203
+ doc = Nokogiri::HTML( @html )
204
+ selectors_to_exclude.to_a.each do |selector_to_exclude|
205
+ ( doc / selector_to_exclude ).remove
206
+ end
207
+ @html = doc.to_s
208
+ end
209
+
210
+ def include!( selectors_to_include )
211
+ section = Nokogiri::HTML( @html )
212
+ selectors_to_include.to_a.each do |selector_to_include|
213
+ subsection = section / selector_to_include
214
+ section = subsection unless subsection.empty?
215
+ end
216
+ @html = section.inner_html
217
+ end
218
+
219
+ def sanitize!( sanitize_options={} )
220
+ # Sanitize does not thoroughly remove these tags -- so we do a manual pass:
221
+ %w[ script noscript style ].each do |tag|
222
+ unless sanitize_options[ :elements ] && sanitize_options[ :elements ].include?( tag )
223
+ @html.gsub!( %r{<#{tag}[^>]*>.*?</#{tag}>}mi, '' )
224
+ end
225
+ end
226
+
227
+ @html = Sanitize.clean( @html, sanitize_options )
228
+ @html
229
+ end
230
+
231
+ def absolutify_links!(source_url)
232
+ absolutify_paths!('a', 'href', source_url)
233
+ end
234
+
235
+ def absolutify_images!(source_url)
236
+ absolutify_paths!('img', 'src', source_url)
237
+ end
238
+
239
+ def absolutify_paths!(tag_name, attr, source_url)
240
+ raise "When asking for absolute images or paths, please pass in source_url" unless source_url
241
+ match = source_url.match( %r{(^[a-z]+?://[^/]+)(/.+/)?}i )
242
+ return @html unless match
98
243
  base_url = match[ 1 ]
99
244
  resource_dir_url = match[ 0 ] # whole regexp match
245
+ dom = Nokogiri::HTML.fragment( @html )
100
246
 
101
- dom = Nokogiri::HTML.fragment( html )
102
- links = dom / 'a'
103
- links.each do |link|
104
- href = link[ 'href' ]
105
- if href
106
- link[ 'href' ] =
107
- case href
247
+ tags = dom / tag_name
248
+ tags.each do |tag|
249
+ value = tag[ attr ]
250
+ if value
251
+ tag[ attr ] =
252
+ case value
253
+ when %r{^//} # eg src="//upload.wikimedia.org/wikipedia/Map.png"
254
+ value
108
255
  when %r{^/}
109
- File.join( base_url, href )
256
+ File.join( base_url, value )
110
257
  when %r{^\.\.}
111
- File.join( resource_dir_url, href )
258
+ File.join( resource_dir_url, value )
112
259
  else
113
- href
260
+ value
114
261
  end
115
262
  end
116
263
  end
117
- html = dom.to_s
118
- html
264
+
265
+ @html = dom.to_s.strip
119
266
  end
120
267
 
121
- def to_html
122
- @clean_html
268
+ def tidy_whitespace!
269
+ @html = strip_lines(@html)
270
+ tidy_tables!
271
+ end
272
+
273
# Collapses every run of two or more newlines inside a <table>...</table>
# span of @html into a single newline, in place.
#
# Returns the (possibly updated) HTML String, including when @html holds
# no table at all.
def tidy_tables!
  @html.gsub!( %r{(<table\b)(.+?)(</table>)}m ) do
    # Copy the captures first: the inner gsub below resets $1..$3.
    opening, inner, closing = $1, $2, $3
    opening + inner.gsub( /\n{2,}/, "\n" ) + closing
  end
  # gsub! returns nil when nothing matched; return the HTML regardless so
  # this agrees with the other bang methods.
  @html
end
124
276
 
125
277
  def to_text
126
- text = CGI.unescapeHTML( @clean_html )
278
+ text = CGI.unescapeHTML( @html )
127
279
 
128
280
  # normalize newlines
129
281
  text.gsub!(/\r\n/, "\n")
@@ -132,7 +284,7 @@ module HtmlMassager
132
284
  # nbsp => ' '
133
285
  text.gsub!(/&nbsp;/, ' ')
134
286
 
135
- # TODO: figure out how to do these in ruby 1.9.2:
287
+ # TODO: figure out how to do these in ruby 1.9:
136
288
  # They now throw 'incompatible encoding -- ascii regexp for utf8 string'
137
289
  # text.gsub!( /\302\240/, ' ' ) # UTF8 for nbsp
138
290
  # text.gsub!( /\240/, ' ' ) # ascii for nbsp
@@ -163,14 +315,21 @@ module HtmlMassager
163
315
  "#{text}\n"
164
316
  end
165
317
 
166
- def strip_lines( text )
167
- lines = text.split( "\n" )
318
+ def strip_lines(content)
319
+ lines = content.split( $/ ) # $/ is the current ruby line ending, \n by default
168
320
  lines.map!{ |line| line.strip }
169
- text = lines.join( "\n" )
170
- text.strip
321
+ processed = lines.join( $/ )
322
+ processed.strip
323
+ end
324
+
325
+
326
+ def to_html
327
+ @html.strip!
328
+ @html
171
329
  end
172
330
 
173
331
  end
332
+
174
333
  end
175
334
 
176
- include HtmlMassager
335
+ include HtmlMassager
@@ -0,0 +1,210 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'html_massage'))
4
+
5
+ describe HtmlMassager::HtmlMassage do
6
+
7
+ include HtmlMassager
8
+
9
+ describe ".html" do
10
+ it 'Should massage and output HTML' do
11
+ html = "<html><body><div>This is some great content!</div></body></html>"
12
+ HtmlMassage.html(html).should == "<div>This is some great content!</div>"
13
+ end
14
+
15
+ it 'should remove HTML "doctype"' do
16
+ html = '
17
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
18
+ <html xmlns="http://www.w3.org/1999/xhtml">
19
+ <body>
20
+ <p>foobar</p>
21
+ </body>
22
+ </html>
23
+ '
24
+ HtmlMassage.html(html).strip.should == "<p>foobar</p>"
25
+ end
26
+
27
+ end
28
+
29
+ describe ".text" do
30
+ it 'Should massage and output text' do
31
+ html = "<html><body><div>This is some great content!</div></body></html>"
32
+ HtmlMassage.text(html).strip.should == "This is some great content!"
33
+ end
34
+
35
+ it 'should convert an HTML sample as expected' do
36
+ html = "
37
+ <html><body>
38
+ <h1>Title</h1>
39
+ This is the body.
40
+ Testing <a href='http://www.google.com/'>link to Google</a>.
41
+ <p />
42
+ Testing image <img src='/noimage.png'>.
43
+ <br />
44
+ The End.
45
+ </body></html>
46
+ "
47
+ HtmlMassage.text(html).strip.should == "Title
48
+
49
+ This is the body. Testing link to Google.
50
+
51
+ Testing image .
52
+ The End.
53
+ ".strip.gsub(/^ +/, '')
54
+ end
55
+
56
+ it 'should play nice with UTF8 HTML source' do
57
+ html = '
58
+ <html>
59
+ <head>
60
+ <meta content="text/html; charset=utf-8" http-equiv="content-type" />
61
+ </head>
62
+ <body>
63
+ Niq is a performer → Angry, arrogant, &amp; so admired.
64
+ </body>
65
+ </html>
66
+ '
67
+ HtmlMassage.text(html).strip.should == "Niq is a performer → Angry, arrogant, & so admired."
68
+ end
69
+
70
+ it 'should play nice with &nbsp;' do
71
+ pending
72
+ html = '&nbsp;&nbsp;&nbsp;'
73
+ HtmlMassage.text(html).strip.should == " "
74
+ end
75
+ end
76
+
77
+ describe ".markdown" do
78
+ it 'Should massage and output markdown' do
79
+ html = "<html><body><div>This is some <i>great</i> content!</div></body></html>"
80
+ massaged = HtmlMassage.markdown html
81
+ massaged.strip.should == "This is some _great_ content!"
82
+ end
83
+ end
84
+
85
+ describe "#massage!" do
86
+
87
+ context 'invalid html' do
88
+ [
89
+ "<html><body>foobar</body>",
90
+ "<html><body>foobar</html>",
91
+ "<body>foobar</body></html>",
92
+ "<html>foobar</body></html>",
93
+ ].each do |broken_html|
94
+ it "should return 'foobar' when given #{broken_html.inspect}" do
95
+ HtmlMassage.new(broken_html).massage!.to_text.strip.should == "foobar"
96
+ end
97
+ end
98
+ end
99
+
100
+ pending 'should convert an HTML sample as expected'
101
+
102
+ it 'should leave HTML entities intact' do
103
+ pending 'improve ::Node.massage_html -- handling of html entities, utf8 chars'
104
+ original = "This &ldquo;branching&rdquo; of creative works"
105
+ massage = HtmlMassager::HtmlMassage.new( original )
106
+ massage.massage!.should == original
107
+ end
108
+ end
109
+
110
+ describe ".sanitize_html" do
111
+ it 'should remove <style> tags and their contents' do
112
+ html = %~<!-- Remix button --><br />
113
+ <style type='text/css'>
114
+ a.remix_on_wikinodes_tab {
115
+ top: 25%; left: 0; width: 42px; height: 100px; color: #FFF; cursor:pointer; text-indent:-99999px; overflow:hidden; position: fixed; z-index: 99999; margin-left: -7px; background-image: url(http://www.openyourproject.org/images/remix_tab.png); _position: absolute; right: 0 !important; left: auto !important; margin-right: -7px !important; margin-left: auto !important; } a.remix_on_wikinodes_tab:hover { margin-left: -4px; margin-right: -4px !important; margin-left: auto !important;
116
+ }
117
+ </style>
118
+ <p> <script type="text/javascript" language="javascript"> document.write( '<a style="background-color: #2a2a2a;" class="remix_on_wikinodes_tab" href="http://www.openyourproject.org/nodes/new?parent=' + window.location + '" title="Remix this content on WikiNodes -- creative collaboration designed to set you free" >Remix This</a>' ); </script> <noscript>Note: you can turn on Javascript to see the &#8216;Remix This&#8217; link.</noscript></p>
119
+ ~
120
+ html_massager = HtmlMassage.new( html )
121
+ html_massager.sanitize!.should_not =~ /remix_on_wikinodes_tab/
122
+ end
123
+
124
+ it 'should remove <noscript> tags and their contents' do
125
+ html = %{ <noscript>Note: you can turn on Javascript to see the 'Remix This' link. </noscript> }
126
+ html_massager = HtmlMassage.new( html )
127
+ html_massager.sanitize!.strip.should == ''
128
+ end
129
+ end
130
+
131
+ describe '#absolutify_links' do
132
+ it 'should work for absolute path links' do
133
+ source_url = 'http://en.wikipedia.org/wiki/Singularity'
134
+ original_html = '<a href="/wiki/Ray_Kurzweil">Ray</a>'
135
+ html_massager = HtmlMassage.new( original_html )
136
+ html_massager.absolutify_links!(source_url).should ==
137
+ '<a href="http://en.wikipedia.org/wiki/Ray_Kurzweil">Ray</a>'
138
+ end
139
+
140
+ it 'should work for absolute path links (bugfix)' do
141
+ source_url = 'http://p2pfoundation.net/NextNet'
142
+ original_html = '<a href="/Ten_Principles_for_an_Autonomous_Internet" title="Ten Principles for an Autonomous Internet">Ten Principles for an Autonomous Internet</a>'
143
+ html_massager = HtmlMassage.new( original_html )
144
+ html_massager.absolutify_links!(source_url).should ==
145
+ '<a href="http://p2pfoundation.net/Ten_Principles_for_an_Autonomous_Internet" title="Ten Principles for an Autonomous Internet">Ten Principles for an Autonomous Internet</a>'
146
+ end
147
+
148
+ it 'should work for relative links' do
149
+ source_url = 'http://en.wikipedia.org/wiki/Singularity'
150
+ original_html = '<a href="../wiki/Ray_Kurzweil">Ray</a>'
151
+ html_massager = HtmlMassage.new( original_html )
152
+ html_massager.absolutify_links!(source_url).should ==
153
+ '<a href="http://en.wikipedia.org/wiki/../wiki/Ray_Kurzweil">Ray</a>'
154
+ end
155
+
156
+ it 'should leave full URLs alone' do
157
+ source_url = 'http://en.wikipedia.org/wiki/Singularity'
158
+ original_html = '<a href="http://www.wired.com/wiredscience">wired science</a>'
159
+ html_massager = HtmlMassage.new( original_html )
160
+ html_massager.absolutify_links!(source_url).should == original_html
161
+ end
162
+
163
+ it 'should leave // style URLs alone' do
164
+ source_url = 'http://en.wikipedia.org/wiki/Singularity'
165
+ original_html = '<a href="//wired.com/wiredscience">wired science</a>'
166
+ html_massager = HtmlMassage.new( original_html )
167
+ html_massager.absolutify_links!(source_url).should == original_html
168
+ end
169
+
170
+ it 'should leave "jump links" alone' do
171
+ source_url = 'http://en.wikipedia.org/wiki/Singularity'
172
+ original_html = '<a href="#cite_1">1</a>'
173
+ html_massager = HtmlMassage.new( original_html )
174
+ html_massager.absolutify_links!(source_url).should == original_html
175
+ end
176
+ end
177
+
178
+ describe '#absolutify_images!' do
179
+ it 'should work for absolute path links' do
180
+ source_url = 'http://enlightenedstructure.org/Home/'
181
+ original_html = '<img src="/IMG/we-are.png" alt="" class="icon">'
182
+ html_massager = HtmlMassage.new( original_html )
183
+ html_massager.absolutify_images!(source_url).should ==
184
+ '<img src="http://enlightenedstructure.org/IMG/we-are.png" alt="" class="icon">'
185
+ end
186
+
187
+ it 'should work for absolute path links (bugfix)' do
188
+ source_url = 'http://www.realitysandwich.com/blog/daniel_pinchbeck'
189
+ original_html = '<img src="/sites/realitysandwich.com/themes/zen/pinkreality/images/creative-commons-license.png" alt="Attribution-Noncommercial-Share Alike 3.0 Unported" title="" width="88" height="31">'
190
+ html_massager = HtmlMassage.new( original_html )
191
+ html_massager.absolutify_images!(source_url).should ==
192
+ '<img src="http://www.realitysandwich.com/sites/realitysandwich.com/themes/zen/pinkreality/images/creative-commons-license.png" alt="Attribution-Noncommercial-Share Alike 3.0 Unported" title="" width="88" height="31">'
193
+ end
194
+
195
+ it 'should leave // style URLs alone' do
196
+ source_url = 'http://en.wikipedia.org/wiki/List_of_communes_in_France_with_over_20,000_inhabitants_(2006_census)'
197
+ original_html = '<img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/f9/France-CIA_WFB_Map.png/220px-France-CIA_WFB_Map.png" width="220" height="235" class="thumbimage">'
198
+ html_massager = HtmlMassage.new( original_html )
199
+ html_massager.absolutify_images!(source_url).should == original_html
200
+ end
201
+ end
202
+
203
+ describe '#tidy_tables!' do
204
+ it 'should remove multiple newlines from tables' do
205
+ HtmlMassage.new("<table><tr>\n<th>Chư\n\n\nYang Sin National Park</th>\n\n\n</tr></table>").tidy_tables!.should ==
206
+ "<table><tr>\n<th>Chư\nYang Sin National Park</th>\n</tr></table>"
207
+ end
208
+ end
209
+
210
+ end
metadata CHANGED
@@ -1,82 +1,140 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: html_massage
3
- version: !ruby/object:Gem::Version
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
4
5
  prerelease:
5
- version: 0.0.2
6
6
  platform: ruby
7
- authors:
8
- - Harlan Knight Wood
7
+ authors:
8
+ - Harlan T Wood
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2011-06-18 00:00:00 Z
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
12
+ date: 2012-11-25 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
16
15
  name: nokogiri
17
- prerelease: false
18
- requirement: &id001 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
19
17
  none: false
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 1.4.4
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '1.4'
24
22
  type: :runtime
25
- version_requirements: *id001
26
- - !ruby/object:Gem::Dependency
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '1.4'
30
+ - !ruby/object:Gem::Dependency
27
31
  name: sanitize
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '2.0'
38
+ type: :runtime
28
39
  prerelease: false
29
- requirement: &id002 !ruby/object:Gem::Requirement
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '2.0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: thor
48
+ requirement: !ruby/object:Gem::Requirement
30
49
  none: false
31
- requirements:
32
- - - ">="
33
- - !ruby/object:Gem::Version
34
- version: 2.0.0
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
35
54
  type: :runtime
36
- version_requirements: *id002
37
- description: "Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text."
38
- email:
39
- - code@hkw7.org
40
- executables: []
41
-
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rest-client
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '1.6'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '1.6'
78
+ - !ruby/object:Gem::Dependency
79
+ name: rspec
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '2.5'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '2.5'
94
+ description: ! 'Massages HTML how you want to: sanitize tags, remove headers and footers,
95
+ convert to plain text.'
96
+ email:
97
+ - code@harlantwood.net
98
+ executables:
99
+ - html_massage
42
100
  extensions: []
43
-
44
101
  extra_rdoc_files: []
45
-
46
- files:
102
+ files:
47
103
  - .gitignore
48
104
  - Gemfile
105
+ - License-MIT
49
106
  - README.md
50
107
  - Rakefile
108
+ - bin/html_massage
109
+ - generate_readme.rb
51
110
  - html_massage.gemspec
52
111
  - lib/html_massage.rb
112
+ - lib/html_massage/cli.rb
53
113
  - lib/html_massage/version.rb
54
- homepage: https://github.com/onesunone/html_massage
114
+ - spec/html_massage_spec.rb
115
+ homepage: https://github.com/harlantwood/html_massage
55
116
  licenses: []
56
-
57
117
  post_install_message:
58
118
  rdoc_options: []
59
-
60
- require_paths:
119
+ require_paths:
61
120
  - lib
62
- required_ruby_version: !ruby/object:Gem::Requirement
121
+ required_ruby_version: !ruby/object:Gem::Requirement
63
122
  none: false
64
- requirements:
65
- - - ">="
66
- - !ruby/object:Gem::Version
67
- version: "0"
68
- required_rubygems_version: !ruby/object:Gem::Requirement
123
+ requirements:
124
+ - - ! '>='
125
+ - !ruby/object:Gem::Version
126
+ version: '0'
127
+ required_rubygems_version: !ruby/object:Gem::Requirement
69
128
  none: false
70
- requirements:
71
- - - ">="
72
- - !ruby/object:Gem::Version
73
- version: "0"
129
+ requirements:
130
+ - - ! '>='
131
+ - !ruby/object:Gem::Version
132
+ version: '0'
74
133
  requirements: []
75
-
76
- rubyforge_project: html_massage
77
- rubygems_version: 1.8.5
134
+ rubyforge_project:
135
+ rubygems_version: 1.8.24
78
136
  signing_key:
79
137
  specification_version: 3
80
138
  summary: Massages HTML how you want to.
81
- test_files: []
82
-
139
+ test_files:
140
+ - spec/html_massage_spec.rb