html_massage 0.0.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -2,3 +2,4 @@
2
2
  .idea
3
3
  Gemfile.lock
4
4
  pkg/*
5
+ README-backup*
data/Gemfile CHANGED
@@ -1,4 +1,6 @@
1
- source "http://rubygems.org"
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'reverse_markdown', :git => 'git://github.com/harlantwood/reverse_markdown.git'
2
4
 
3
5
  # Specify your gem's dependencies in html_massage.gemspec
4
6
  gemspec
data/License-MIT ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Harlan T Wood
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md CHANGED
@@ -1,18 +1,80 @@
1
1
  # html_massage
2
2
 
3
3
  Give your HTML a massage, in just the ways it loves:
4
+
4
5
  * Remove headers and footers and navigation, and strip to only the "content" part of the HTML
5
6
  * Sanitize tags, removing javascript and styling
6
7
  * Convert your HTML to nicely-formatted plain text
7
8
 
8
- ## Usage
9
+ ## Sample Usage
10
+
11
+ ### Full Massage
9
12
 
10
- require 'rubygems'
11
13
  require 'html_massage'
12
- html = "<html><body><div id='header'>My Site</div><div>This is some great content!</div></body></html>"
13
- html_massage = HtmlMassage.new( html, :ignored_selectors => [ '#header' ] )
14
- # => #<HtmlMassager::HtmlMassage ... >
15
- html_massage.to_html
16
- # => "<div>This is some great content!</div>"
17
- html_massage.to_text
18
- # => "This is some great content!\n"
14
+
15
+ html = %{
16
+ <html>
17
+ <head>
18
+ <script type="text/javascript">document.write('I am a bad script');</script>
19
+ </head>
20
+ <body>
21
+ <div id="header">My Site</div>
22
+ <div>This is some great content!</div>
23
+ <a href ="foo/bar.html">Click this link</a>
24
+ </body>
25
+ </html>
26
+ }
27
+
28
+ puts HtmlMassage.html( html )
29
+ # => "<div>This is some great content!</div>"
30
+
31
+ puts HtmlMassage.text( html )
32
+ # => "This is some great content!\n"
33
+
34
+ ### Content Only
35
+
36
+ html_massage = HtmlMassage.new( html,
37
+ :exclude => [ '#header' ] )
38
+ # => #<HtmlMassager::HtmlMassage ... >
39
+
40
+ puts html_massage.exclude!
41
+ # <div>This is some great content!</div>
42
+ # <a href="foo/bar.html">Click this link</a>
43
+
44
+ ### Sanitize HTML
45
+
46
+ html_massage = HtmlMassage.new( html,
47
+ :exclude => [ '#header' ] )
48
+ # => #<HtmlMassager::HtmlMassage ... >
49
+
50
+ puts html_massage.sanitize_html!
51
+ # <html>
52
+ # <head>
53
+ # </head>
54
+ # <body>
55
+ # <div id="header">My Site</div>
56
+ # <div>This is some great content!</div>
57
+ # </body>
58
+ # </html>
59
+
60
+ ### Make Links Absolute
61
+
62
+ html_massage = HtmlMassage.new( html,
63
+ :exclude => [ '#header' ],
64
+ :source_url => 'http://example.com/joe/page1.html' )
65
+
66
+ puts html_massage.absolutify_links!
67
+ # <html>
68
+ # <head>
69
+ # <script type="text/javascript">document.write('I am a bad script');</script>
70
+ # </head>
71
+ # <body>
72
+ # <div id="header">My Site</div>
73
+ # <div>This is some great content!</div>
74
+ # <a href ="http://example.com/joe/foo/bar.html">Click this link</a>
75
+ # </body>
76
+ # </html>
77
+
78
+ puts html_massage.absolutify_images!
79
+ #
80
+
data/Rakefile CHANGED
@@ -1 +1 @@
1
- require 'bundler/gem_tasks'
1
+ require "bundler/gem_tasks"
data/bin/html_massage ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'html_massage/cli'
4
+ HtmlMassager::CLI.start
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env ruby
2
+
3
class IO
  # Write +content+ to the file at +path+, creating the file if needed and
  # replacing any previous contents (the file is opened in "w" mode).
  #
  # The block form of File.open guarantees the handle is closed even when the
  # write itself raises, so a failed write no longer leaks an open file.
  #
  # NOTE: newer Ruby versions ship a built-in IO.write; this definition
  # replaces it for the lifetime of the script.
  #
  # Returns nil.
  def self.write( path, content )
    File.open( path, "w" ) { |file| file.write( content ) }
    nil
  end
end
10
+
11
+ CHUNK_SEP = "\n\n"
12
+
13
# True when the given markdown chunk begins with a space character, which this
# script treats as the marker of an indented code sample.
def is_code?( markdown )
  markdown[ 0, 1 ] == ' '
end
16
+
17
# Print a banner to standard output: +top_newlines+ newline characters of
# leading space, then +text+ framed above and below by a row of ten asterisks.
def header( text, top_newlines )
  rule = '*' * 10
  puts "\n" * top_newlines
  puts rule, text, rule
end
23
+
24
require 'fileutils'

# Regenerate README.md in place.
#
# The README is split into chunks separated by blank lines. Each chunk that
# is_code? recognises is appended to the code seen so far and evaluated, and
# the chunk's trailing "#" result comment lines are replaced with the value
# the evaluation actually produced. Every chunk -- prose as well as code -- is
# then written back, so the prose of the README is preserved.
#
# SECURITY NOTE: the code chunks of README.md are passed to `eval`. Only run
# this script against a README whose contents you trust.

# Keep a timestamped copy of the current README before overwriting it.
FileUtils.cp( 'README.md', "README-backup-#{Time.now.to_s.gsub(/\W/, '-')}.md" )

readme = IO.read( 'README.md' )
code = ''        # every code chunk seen so far; each eval re-runs all of it
new_readme = ''

readme.split( CHUNK_SEP ).each do |chunk|
  if is_code?( chunk )
    code << chunk << CHUNK_SEP

    header( 'Code', 3 )
    puts code
    header( 'Result', 1 )
    puts result = eval( code )

    unless result.nil?
      result = result.to_s
      # Capture the leading code lines of the chunk, dropping the trailing
      # "#" comment lines that held the previous result.
      _, code_sans_results = chunk.match( /\A((?: [^#].*\r?\n)+)(?: #.*\r?\n)+\Z/ ).to_a
      if code_sans_results
        result = result.split("\n").map{ |line| " # #{line}" }.join("\n")
        chunk = code_sans_results << result << CHUNK_SEP
      end
    end

    header( 'Output', 1 )
    puts chunk
  end

  # Appended for every chunk, so non-code text is carried over unchanged.
  new_readme << chunk << CHUNK_SEP
end

IO.write( 'README.md', new_readme )
data/html_massage.gemspec CHANGED
@@ -1,23 +1,28 @@
1
1
  # -*- encoding: utf-8 -*-
2
- $:.push File.expand_path("../lib", __FILE__)
3
- require "html_massage/version"
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'html_massage/version'
4
5
 
5
- Gem::Specification.new do |s|
6
- s.name = "html_massage"
7
- s.version = HtmlMassager::VERSION
8
- s.authors = ["Harlan Knight Wood"]
9
- s.email = ["code@hkw7.org"]
10
- s.homepage = "https://github.com/onesunone/html_massage"
11
- s.summary = %{Massages HTML how you want to.}
12
- s.description = %{Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text.}
6
+ Gem::Specification.new do |gem|
7
+ gem.name = "html_massage"
8
+ gem.version = HtmlMassager::VERSION
9
+ gem.authors = ["Harlan T Wood"]
10
+ gem.email = ["code@harlantwood.net"]
11
+ gem.homepage = "https://github.com/harlantwood/html_massage"
12
+ gem.summary = %{Massages HTML how you want to.}
13
+ gem.description = %{Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text.}
13
14
 
14
- s.rubyforge_project = "html_massage"
15
+ gem.files = `git ls-files`.split($/)
16
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
+ gem.require_paths = ["lib"]
15
19
 
16
- s.add_dependency('nokogiri', ">= 1.4.4")
17
- s.add_dependency('sanitize', ">= 2.0.0")
20
+ gem.add_dependency "nokogiri", ">= 1.4"
21
+ gem.add_dependency "sanitize", ">= 2.0"
22
+ gem.add_dependency "thor"
23
+ gem.add_dependency "rest-client", ">= 1.6"
24
+
25
+ gem.add_development_dependency "rspec", ">= 2.5"
18
26
 
19
- s.files = `git ls-files`.split("\n")
20
- s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
21
- s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
22
- s.require_paths = ["lib"]
23
27
  end
28
+
@@ -0,0 +1,35 @@
1
+ require 'thor'
2
+ require 'rest_client'
3
+ require 'html_massage'
4
+
5
module HtmlMassager

  # Thor-based command line interface. Each command downloads the page at the
  # given URL and prints the massaged result to STDOUT.
  class CLI < Thor

    desc( :html, 'Download HTML from given URL and massage into html' )
    def html( url )
      STDOUT.puts( massage_to( :html, url ) )
    end

    desc( :text, 'Download HTML from given URL and massage into plain text' )
    def text( url )
      STDOUT.puts( massage_to( :text, url ) )
    end

    desc( :markdown, 'Download HTML from given URL and massage into markdown' )
    def markdown( url )
      STDOUT.puts( massage_to( :markdown, url ) )
    end

    # Helper shared by the commands above; declared inside no_tasks so that
    # Thor does not treat it as a command of its own.
    no_tasks do
      # Fetch +url+ and hand the response to the HtmlMassage class method
      # named by +output_format+, asking for absolute links and images
      # resolved against +url+.
      def massage_to( output_format, url )
        page = RestClient.get( url )
        massage_options = {
          :source_url => url,
          :links      => :absolute,
          :images     => :absolute
        }
        HtmlMassage.send( output_format, page, massage_options )
      end
    end

  end
end
@@ -1,3 +1,3 @@
1
1
  module HtmlMassager
2
- VERSION = "0.0.2"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/html_massage.rb CHANGED
@@ -1,129 +1,281 @@
1
1
  require "cgi"
2
2
  require "nokogiri"
3
3
  require "sanitize"
4
+ require "reverse_markdown"
4
5
  require "html_massage/version"
5
6
 
6
7
  module HtmlMassager
8
+
7
9
  class HtmlMassage
8
- def initialize( html, options )
9
- @source_url = options[ :source_url ]
10
- @ignored_selectors = options[ :ignored_selectors ]
11
- @clean_html = massage_html( html )
12
- end
13
10
 
14
- def massage_html( html )
15
- html = content_only( html )
16
- html = sanitize_html( html )
17
- html = absolutify_links( html ) if @source_url
11
+ INCLUDE_CONTENT_ONLY = %w[
18
12
  html
19
- end
13
+ body
14
+ ]
20
15
 
21
- def content_only( content )
22
- doc = Nokogiri::HTML( content )
23
- body = doc / 'html' / 'body'
16
+ DEFAULT_EXCLUDE_OPTIONS = [
17
+ # general:
18
+ 'head',
19
+ 'title',
20
+ 'meta',
24
21
 
25
- @ignored_selectors.to_a.each do |ignored_selector|
26
- ( body / ignored_selector ).remove
27
- end
22
+ 'div#header',
23
+ 'div.header',
24
+ 'div#banner',
25
+ 'div.banner',
26
+ '.footer',
27
+ '#footer',
28
+ 'div#navigation',
29
+ 'div.navigation',
30
+ 'div#nav',
31
+ 'div.nav',
32
+ 'div#sidebar',
33
+ 'div.sidebar',
34
+ '#breadcrumbs',
35
+ '.breadcrumbs',
36
+ '#backfornav',
37
+ '.backfornav',
38
+ 'div.post-footer',
39
+ 'div.navigation',
28
40
 
29
- content = body / '#content'
30
- content = body if content.empty?
31
- content = content.inner_html
32
- content
33
- end
41
+ # wordpress:
42
+ 'a#left_arrow',
43
+ 'a#right_arrow',
44
+ 'div#comments',
45
+ 'div#comment-section',
46
+ 'div#respond',
34
47
 
35
- def sanitize_html(html)
36
- html = html.dup
48
+ # typepad
49
+ '#pagebody > #pagebody-inner > #alpha',
50
+ 'p.content-nav',
51
+
52
+ # blog widgets
53
+ '.widget_blog_subscription',
54
+ '.loggedout-follow-normal',
37
55
 
38
- %w[ script noscript style ].each do |tag|
39
- html.gsub!( %r{<#{tag}[^>]*>.*?</#{tag}>}mi, '' )
40
- end
41
56
 
42
- Sanitize.clean(
43
- html,
44
- {
45
- :elements => [
46
- 'a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
47
- 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
48
- 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir',
49
- 'div', 'dl', 'dt', 'em', 'fieldset', 'form', 'h1',
50
- 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
51
- 'img',
52
- 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu',
53
- 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
54
- 'select', 'small', 'span', 'strike', 'strong', 'sub',
55
- 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
56
- 'thead', 'tr', 'tt', 'u', 'ul', 'var',
57
+ # wikipedia
58
+
59
+ '#bodyContent > #siteSub',
60
+ '#bodyContent > #contentSub',
61
+ '#bodyContent > #jump-to-nav',
62
+ 'table.metadata',
63
+ 'table.navbox',
64
+ 'table.toc',
65
+ 'div#catlinks',
66
+ 'div.printfooter',
67
+ 'h1 > span.editsection',
68
+ 'h2 > span.editsection',
69
+ 'h3 > span.editsection',
70
+ 'h4 > span.editsection',
71
+ 'h5 > span.editsection',
72
+ 'h6 > span.editsection',
73
+
74
+ # wikipedia "message boxes" -- metadata such as "requires cleanup":
75
+ # see http://en.wikipedia.org/wiki/Template:Ambox
76
+ 'table.ambox',
77
+ 'table.tmbox',
78
+ 'table.imbox',
79
+ 'table.cmbox',
80
+ 'table.ombox',
81
+ 'table.fmbox',
82
+ 'table.dmbox',
83
+
84
+ # mediawiki
85
+ '#mw-subcategories',
86
+ '#mw-pages',
87
+ '#mw-head',
88
+ '#mw-panel',
89
+
90
+ # social media sharing:
91
+ 'ul#sharebar',
92
+ 'ul#sharebarx',
93
+ '.sharedaddy',
94
+ '#sharing_email',
95
+
96
+ # signup:
97
+ '#mailchimp_signup_bottom',
98
+ ]
99
+
100
+ DEFAULT_SANITIZE_OPTIONS = {
101
+ :elements => %w[
102
+ a abbr acronym address area b big
103
+ blockquote br button caption center cite
104
+ code col colgroup dd del dfn dir
105
+ div dl dt em fieldset form h1
106
+ h2 h3 h4 h5 h6 hr i
107
+ img
108
+ input ins kbd label legend li map menu
109
+ ol optgroup option p pre q s samp
110
+ select small span strike strong sub
111
+ sup table tbody td textarea tfoot th
112
+ thead tr tt u ul var
57
113
  ],
58
114
  :attributes => {
59
- 'a' => ['href'],
60
- 'img' => ['src'],
61
- :all => ['abbr', 'accept', 'accept-charset',
62
- 'accesskey', 'action', 'align', 'alt', 'axis',
63
- 'border', 'cellpadding', 'cellspacing', 'char',
64
- 'charoff', 'class', 'charset', 'checked', 'cite',
65
- 'clear', 'cols', 'colspan', 'color',
66
- 'compact', 'coords', 'datetime', 'dir',
67
- 'disabled', 'enctype', 'for', 'frame',
68
- 'headers', 'height', 'hreflang',
69
- 'hspace', 'id', 'ismap', 'label', 'lang',
70
- 'longdesc', 'maxlength', 'media', 'method',
71
- 'multiple', 'name', 'nohref', 'noshade',
72
- 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
73
- 'rows', 'rowspan', 'rules', 'scope',
74
- 'selected', 'shape', 'size', 'span',
75
- 'start', 'summary', 'tabindex', 'target',
76
- 'title', 'type', 'usemap', 'valign', 'value',
77
- 'vspace', 'width']
115
+ 'a' => %w[ href ],
116
+ 'img' => %w[ src ],
117
+ :all => %w[
118
+ abbr accept accept-charset
119
+ accesskey action align alt axis
120
+ border cellpadding cellspacing char
121
+ charoff class charset checked cite
122
+ clear cols colspan color
123
+ compact coords datetime dir
124
+ disabled enctype for frame
125
+ headers height hreflang
126
+ hspace id ismap label lang
127
+ longdesc maxlength media method
128
+ multiple name nohref noshade
129
+ nowrap prompt readonly rel rev
130
+ rows rowspan rules scope
131
+ selected shape size span
132
+ start summary tabindex target
133
+ title type usemap valign value
134
+ vspace width
135
+ ]
78
136
  },
137
+
138
+ # medium permissive list:
139
+ #:elements => [
140
+ # 'a', 'b', 'blockquote', 'br', 'code', 'dd', 'del', 'dl', 'dt',
141
+ # 'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
142
+ # 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike', 'strong', 'sub',
143
+ # 'sup', 'table', 'tbody', 'td', 'th',
144
+ # 'thead', 'tr', 'u', 'ul',
145
+ #],
146
+
79
147
  :protocols => {
80
148
  'a' => {'href' => ['http', 'https', 'mailto', :relative]},
81
149
  'img' => {'src' => ['http', 'https', :relative]}
82
150
  },
83
151
 
84
- # consider including for deprecated/historical/or spam-suspect pages:
85
- # Gollum has a nice way to add this to your config optionally, see:
86
- # https://github.com/github/gollum/blob/master/lib/gollum/sanitization.rb
152
+ # Consider including for deprecated/historical or spam-suspect pages:
87
153
  #
88
154
  # :add_attributes => {
89
155
  # 'a' => {'rel' => 'nofollow'}
90
156
  # }
157
+ #
158
+ # Gollum has a nice way to add this to your config optionally, see:
159
+ # https://github.com/github/gollum/blob/master/lib/gollum/sanitization.rb
91
160
  }
92
- )
161
+
162
+ DEFAULTS = {
163
+ :include => INCLUDE_CONTENT_ONLY,
164
+ :exclude => DEFAULT_EXCLUDE_OPTIONS,
165
+ :sanitize => DEFAULT_SANITIZE_OPTIONS,
166
+ :links => :unchanged,
167
+ }
168
+
169
# Convenience entry point: wrap +html+ in a new instance, run massage! with
# +options+, and return the result of to_html.
def self.html( html, options={} )
  massaged = new( html ).massage!( options )
  massaged.to_html
end
94
172
 
95
- def absolutify_links( html )
96
- match = @source_url.match( %r{(^[a-z]+://[^/]+)(/.+/)}i )
97
- return html unless match
173
# Convenience entry point: wrap +html+ in a new instance, run massage! with
# +options+, and return the result of to_text.
def self.text( html, options={} )
  massaged = new( html ).massage!( options )
  massaged.to_text
end
176
+
177
# Convenience entry point: massage +html+ into HTML via self.html (using
# +options+) and pass that HTML to ReverseMarkdown.parse, returning its result.
def self.markdown( html, options={} )
  massaged_html = self.html( html, options )
  ReverseMarkdown.parse( massaged_html )
end
180
+
181
# Store a private copy of +html+ as the working document. Duplicating it means
# the in-place edits made by the massage steps never touch the caller's object.
def initialize( html )
  working_copy = html.dup
  @html = working_copy
end
184
+
185
# Run the full massage pipeline on the working HTML and return self.
#
# Recognised +options+ (anything missing falls back to DEFAULTS):
#   :include    - selectors passed to include!
#   :exclude    - selectors passed to exclude! (legacy name :ignored_selectors
#                 is translated by translate_old_options)
#   :sanitize   - options passed to sanitize!
#   :links      - :absolute to rewrite links via absolutify_links!
#   :images     - :absolute to rewrite images via absolutify_images!
#   :source_url - URL of the page, used when absolutifying links/images
#
# Raises a RuntimeError naming any option that is not recognised.
def massage!( options={} )
  # Work on a copy: translate_old_options deletes keys, and the caller's hash
  # should not be altered behind its back.
  options = options.dup
  self.class.translate_old_options( options )
  options = DEFAULTS.merge( options )

  # Consume :source_url here so that it is not reported as an unexpected
  # option by the check at the end of this method.
  source_url = options.delete( :source_url )

  absolutify_links!( source_url )  if options.delete( :links )  == :absolute
  absolutify_images!( source_url ) if options.delete( :images ) == :absolute
  include!( options.delete( :include ) )
  exclude!( options.delete( :exclude ) )
  sanitize!( options.delete( :sanitize ) )
  tidy_whitespace!

  raise "Unexpected options #{options.inspect}" unless options.empty?
  self
end
197
+
198
# Backwards compatibility: when the legacy :ignored_selectors key holds a
# truthy value, move it (in place) to the current :exclude key.
def self.translate_old_options( options )
  return unless options[ :ignored_selectors ]
  options[ :exclude ] = options.delete( :ignored_selectors )
end
201
+
202
# Parse the working HTML, remove every node matched by each selector in
# +selectors_to_exclude+, and store the re-serialised document back in @html.
# Returns the new @html.
def exclude!( selectors_to_exclude )
  document = Nokogiri::HTML( @html )
  selectors_to_exclude.to_a.each { |selector| document.search( selector ).remove }
  @html = document.to_s
end
209
+
210
# Narrow the working HTML to the content matched by +selectors_to_include+.
# The selectors are applied in order, each one searching within the result of
# the previous; a selector that matches nothing is skipped. @html becomes the
# inner HTML of whatever remains, and is returned.
def include!( selectors_to_include )
  document = Nokogiri::HTML( @html )
  narrowed = selectors_to_include.to_a.inject( document ) do |scope, selector|
    matches = scope.search( selector )
    matches.empty? ? scope : matches
  end
  @html = narrowed.inner_html
end
218
+
219
# Sanitize the working HTML with Sanitize.clean using +sanitize_options+, and
# return the result (also stored in @html).
def sanitize!( sanitize_options={} )
  allowed_elements = sanitize_options[ :elements ]

  # Sanitize does not thoroughly remove these tags -- so strip them, contents
  # included, by hand first. A tag explicitly allowed in :elements is kept.
  %w[ script noscript style ].each do |tag|
    next if allowed_elements && allowed_elements.include?( tag )
    @html.gsub!( %r{<#{tag}[^>]*>.*?</#{tag}>}mi, '' )
  end

  @html = Sanitize.clean( @html, sanitize_options )
end
230
+
231
# Rewrite the href attribute of <a> tags against +source_url+ by delegating
# to absolutify_paths!; returns whatever that method returns.
def absolutify_links!( source_url )
  absolutify_paths!( 'a', 'href', source_url )
end
234
+
235
# Rewrite the src attribute of <img> tags against +source_url+ by delegating
# to absolutify_paths!; returns whatever that method returns.
def absolutify_images!( source_url )
  absolutify_paths!( 'img', 'src', source_url )
end
238
+
239
+ def absolutify_paths!(tag_name, attr, source_url)
240
+ raise "When asking for absolute images or paths, please pass in source_url" unless source_url
241
+ match = source_url.match( %r{(^[a-z]+?://[^/]+)(/.+/)?}i )
242
+ return @html unless match
98
243
  base_url = match[ 1 ]
99
244
  resource_dir_url = match[ 0 ] # whole regexp match
245
+ dom = Nokogiri::HTML.fragment( @html )
100
246
 
101
- dom = Nokogiri::HTML.fragment( html )
102
- links = dom / 'a'
103
- links.each do |link|
104
- href = link[ 'href' ]
105
- if href
106
- link[ 'href' ] =
107
- case href
247
+ tags = dom / tag_name
248
+ tags.each do |tag|
249
+ value = tag[ attr ]
250
+ if value
251
+ tag[ attr ] =
252
+ case value
253
+ when %r{^//} # eg src="//upload.wikimedia.org/wikipedia/Map.png"
254
+ value
108
255
  when %r{^/}
109
- File.join( base_url, href )
256
+ File.join( base_url, value )
110
257
  when %r{^\.\.}
111
- File.join( resource_dir_url, href )
258
+ File.join( resource_dir_url, value )
112
259
  else
113
- href
260
+ value
114
261
  end
115
262
  end
116
263
  end
117
- html = dom.to_s
118
- html
264
+
265
+ @html = dom.to_s.strip
119
266
  end
120
267
 
121
- def to_html
122
- @clean_html
268
# Normalise whitespace in the working HTML: run it through strip_lines, store
# the result in @html, then call tidy_tables! and return its result.
def tidy_whitespace!
  stripped = strip_lines( @html )
  @html = stripped
  tidy_tables!
end
272
+
273
# Collapse every run of two or more newlines inside a <table>...</table>
# element down to a single newline. Text outside tables is left untouched.
#
# Always returns the working HTML. String#gsub! returns nil when nothing
# matched, so returning its result directly would hand nil to callers whenever
# the document contains no table.
def tidy_tables!
  @html.gsub!( %r{(<table\b)(.+?)(</table>)}m ) do
    # Grab the captures before the inner gsub below overwrites them.
    opening, body, closing = $1, $2, $3
    opening + body.gsub( /\n{2,}/, "\n" ) + closing
  end
  @html
end
124
276
 
125
277
  def to_text
126
- text = CGI.unescapeHTML( @clean_html )
278
+ text = CGI.unescapeHTML( @html )
127
279
 
128
280
  # normalize newlines
129
281
  text.gsub!(/\r\n/, "\n")
@@ -132,7 +284,7 @@ module HtmlMassager
132
284
  # nbsp => ' '
133
285
  text.gsub!(/&nbsp;/, ' ')
134
286
 
135
- # TODO: figure out how to do these in ruby 1.9.2:
287
+ # TODO: figure out how to do these in ruby 1.9:
136
288
  # They now throw 'incompatible encoding -- ascii regexp for utf8 string'
137
289
  # text.gsub!( /\302\240/, ' ' ) # UTF8 for nbsp
138
290
  # text.gsub!( /\240/, ' ' ) # ascii for nbsp
@@ -163,14 +315,21 @@ module HtmlMassager
163
315
  "#{text}\n"
164
316
  end
165
317
 
166
- def strip_lines( text )
167
- lines = text.split( "\n" )
318
+ def strip_lines(content)
319
+ lines = content.split( $/ ) # $/ is the current ruby line ending, \n by default
168
320
  lines.map!{ |line| line.strip }
169
- text = lines.join( "\n" )
170
- text.strip
321
+ processed = lines.join( $/ )
322
+ processed.strip
323
+ end
324
+
325
+
326
# Return the working HTML after stripping leading and trailing whitespace
# from it in place.
def to_html
  @html.tap { |markup| markup.strip! }
end
172
330
 
173
331
  end
332
+
174
333
  end
175
334
 
176
- include HtmlMassager
335
+ include HtmlMassager
@@ -0,0 +1,210 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'html_massage'))
4
+
5
+ describe HtmlMassager::HtmlMassage do
6
+
7
+ include HtmlMassager
8
+
9
+ describe ".html" do
10
+ it 'Should massage and output HTML' do
11
+ html = "<html><body><div>This is some great content!</div></body></html>"
12
+ HtmlMassage.html(html).should == "<div>This is some great content!</div>"
13
+ end
14
+
15
+ it 'should remove HTML "doctype"' do
16
+ html = '
17
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
18
+ <html xmlns="http://www.w3.org/1999/xhtml">
19
+ <body>
20
+ <p>foobar</p>
21
+ </body>
22
+ </html>
23
+ '
24
+ HtmlMassage.html(html).strip.should == "<p>foobar</p>"
25
+ end
26
+
27
+ end
28
+
29
+ describe ".text" do
30
+ it 'Should massage and output text' do
31
+ html = "<html><body><div>This is some great content!</div></body></html>"
32
+ HtmlMassage.text(html).strip.should == "This is some great content!"
33
+ end
34
+
35
+ it 'should convert an HTML sample as expected' do
36
+ html = "
37
+ <html><body>
38
+ <h1>Title</h1>
39
+ This is the body.
40
+ Testing <a href='http://www.google.com/'>link to Google</a>.
41
+ <p />
42
+ Testing image <img src='/noimage.png'>.
43
+ <br />
44
+ The End.
45
+ </body></html>
46
+ "
47
+ HtmlMassage.text(html).strip.should == "Title
48
+
49
+ This is the body. Testing link to Google.
50
+
51
+ Testing image .
52
+ The End.
53
+ ".strip.gsub(/^ +/, '')
54
+ end
55
+
56
+ it 'should play nice with UTF8 HTML source' do
57
+ html = '
58
+ <html>
59
+ <head>
60
+ <meta content="text/html; charset=utf-8" http-equiv="content-type" />
61
+ </head>
62
+ <body>
63
+ Niq is a performer → Angry, arrogant, &amp; so admired.
64
+ </body>
65
+ </html>
66
+ '
67
+ HtmlMassage.text(html).strip.should == "Niq is a performer → Angry, arrogant, & so admired."
68
+ end
69
+
70
+ it 'should play nice with &nbsp;' do
71
+ pending
72
+ html = '&nbsp;&nbsp;&nbsp;'
73
+ HtmlMassage.text(html).strip.should == " "
74
+ end
75
+ end
76
+
77
+ describe ".markdown" do
78
+ it 'Should massage and output markdown' do
79
+ html = "<html><body><div>This is some <i>great</i> content!</div></body></html>"
80
+ massaged = HtmlMassage.markdown html
81
+ massaged.strip.should == "This is some _great_ content!"
82
+ end
83
+ end
84
+
85
+ describe "#massage!" do
86
+
87
+ context 'invalid html' do
88
+ [
89
+ "<html><body>foobar</body>",
90
+ "<html><body>foobar</html>",
91
+ "<body>foobar</body></html>",
92
+ "<html>foobar</body></html>",
93
+ ].each do |broken_html|
94
+ it "should return 'foobar' when given #{broken_html.inspect}" do
95
+ HtmlMassage.new(broken_html).massage!.to_text.strip.should == "foobar"
96
+ end
97
+ end
98
+ end
99
+
100
+ pending 'should convert an HTML sample as expected'
101
+
102
+ it 'should leave HTML entities intact' do
103
+ pending 'improve ::Node.massage_html -- handling of html entities, utf8 chars'
104
+ original = "This &ldquo;branching&rdquo; of creative works"
105
+ massage = HtmlMassager::HtmlMassage.new( original )
106
+ massage.massage!.should == original
107
+ end
108
+ end
109
+
110
+ describe ".sanitize_html" do
111
+ it 'should remove <style> tags and their contents' do
112
+ html = %~<!-- Remix button --><br />
113
+ <style type='text/css'>
114
+ a.remix_on_wikinodes_tab {
115
+ top: 25%; left: 0; width: 42px; height: 100px; color: #FFF; cursor:pointer; text-indent:-99999px; overflow:hidden; position: fixed; z-index: 99999; margin-left: -7px; background-image: url(http://www.openyourproject.org/images/remix_tab.png); _position: absolute; right: 0 !important; left: auto !important; margin-right: -7px !important; margin-left: auto !important; } a.remix_on_wikinodes_tab:hover { margin-left: -4px; margin-right: -4px !important; margin-left: auto !important;
116
+ }
117
+ </style>
118
+ <p> <script type="text/javascript" language="javascript"> document.write( '<a style="background-color: #2a2a2a;" class="remix_on_wikinodes_tab" href="http://www.openyourproject.org/nodes/new?parent=' + window.location + '" title="Remix this content on WikiNodes -- creative collaboration designed to set you free" >Remix This</a>' ); </script> <noscript>Note: you can turn on Javascript to see the &#8216;Remix This&#8217; link.</noscript></p>
119
+ ~
120
+ html_massager = HtmlMassage.new( html )
121
+ html_massager.sanitize!.should_not =~ /remix_on_wikinodes_tab/
122
+ end
123
+
124
+ it 'should remove <noscript> tags and their contents' do
125
+ html = %{ <noscript>Note: you can turn on Javascript to see the 'Remix This' link. </noscript> }
126
+ html_massager = HtmlMassage.new( html )
127
+ html_massager.sanitize!.strip.should == ''
128
+ end
129
+ end
130
+
131
+ describe '#absolutify_links' do
132
+ it 'should work for absolute path links' do
133
+ source_url = 'http://en.wikipedia.org/wiki/Singularity'
134
+ original_html = '<a href="/wiki/Ray_Kurzweil">Ray</a>'
135
+ html_massager = HtmlMassage.new( original_html )
136
+ html_massager.absolutify_links!(source_url).should ==
137
+ '<a href="http://en.wikipedia.org/wiki/Ray_Kurzweil">Ray</a>'
138
+ end
139
+
140
+ it 'should work for absolute path links (bugfix)' do
141
+ source_url = 'http://p2pfoundation.net/NextNet'
142
+ original_html = '<a href="/Ten_Principles_for_an_Autonomous_Internet" title="Ten Principles for an Autonomous Internet">Ten Principles for an Autonomous Internet</a>'
143
+ html_massager = HtmlMassage.new( original_html )
144
+ html_massager.absolutify_links!(source_url).should ==
145
+ '<a href="http://p2pfoundation.net/Ten_Principles_for_an_Autonomous_Internet" title="Ten Principles for an Autonomous Internet">Ten Principles for an Autonomous Internet</a>'
146
+ end
147
+
148
+ it 'should work for relative links' do
149
+ source_url = 'http://en.wikipedia.org/wiki/Singularity'
150
+ original_html = '<a href="../wiki/Ray_Kurzweil">Ray</a>'
151
+ html_massager = HtmlMassage.new( original_html )
152
+ html_massager.absolutify_links!(source_url).should ==
153
+ '<a href="http://en.wikipedia.org/wiki/../wiki/Ray_Kurzweil">Ray</a>'
154
+ end
155
+
156
+ it 'should leave full URLs alone' do
157
+ source_url = 'http://en.wikipedia.org/wiki/Singularity'
158
+ original_html = '<a href="http://www.wired.com/wiredscience">wired science</a>'
159
+ html_massager = HtmlMassage.new( original_html )
160
+ html_massager.absolutify_links!(source_url).should == original_html
161
+ end
162
+
163
+ it 'should leave // style URLs alone' do
164
+ source_url = 'http://en.wikipedia.org/wiki/Singularity'
165
+ original_html = '<a href="//wired.com/wiredscience">wired science</a>'
166
+ html_massager = HtmlMassage.new( original_html )
167
+ html_massager.absolutify_links!(source_url).should == original_html
168
+ end
169
+
170
+ it 'should leave "jump links" alone' do
171
+ source_url = 'http://en.wikipedia.org/wiki/Singularity'
172
+ original_html = '<a href="#cite_1">1</a>'
173
+ html_massager = HtmlMassage.new( original_html )
174
+ html_massager.absolutify_links!(source_url).should == original_html
175
+ end
176
+ end
177
+
178
+ describe '#absolutify_images!' do
179
+ it 'should work for absolute path links' do
180
+ source_url = 'http://enlightenedstructure.org/Home/'
181
+ original_html = '<img src="/IMG/we-are.png" alt="" class="icon">'
182
+ html_massager = HtmlMassage.new( original_html )
183
+ html_massager.absolutify_images!(source_url).should ==
184
+ '<img src="http://enlightenedstructure.org/IMG/we-are.png" alt="" class="icon">'
185
+ end
186
+
187
+ it 'should work for absolute path links (bugfix)' do
188
+ source_url = 'http://www.realitysandwich.com/blog/daniel_pinchbeck'
189
+ original_html = '<img src="/sites/realitysandwich.com/themes/zen/pinkreality/images/creative-commons-license.png" alt="Attribution-Noncommercial-Share Alike 3.0 Unported" title="" width="88" height="31">'
190
+ html_massager = HtmlMassage.new( original_html )
191
+ html_massager.absolutify_images!(source_url).should ==
192
+ '<img src="http://www.realitysandwich.com/sites/realitysandwich.com/themes/zen/pinkreality/images/creative-commons-license.png" alt="Attribution-Noncommercial-Share Alike 3.0 Unported" title="" width="88" height="31">'
193
+ end
194
+
195
+ it 'should leave // style URLs alone' do
196
+ source_url = 'http://en.wikipedia.org/wiki/List_of_communes_in_France_with_over_20,000_inhabitants_(2006_census)'
197
+ original_html = '<img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/f9/France-CIA_WFB_Map.png/220px-France-CIA_WFB_Map.png" width="220" height="235" class="thumbimage">'
198
+ html_massager = HtmlMassage.new( original_html )
199
+ html_massager.absolutify_images!(source_url).should == original_html
200
+ end
201
+ end
202
+
203
+ describe '#tidy_tables!' do
204
+ it 'should remove multiple newlines from tables' do
205
+ HtmlMassage.new("<table><tr>\n<th>Chư\n\n\nYang Sin National Park</th>\n\n\n</tr></table>").tidy_tables!.should ==
206
+ "<table><tr>\n<th>Chư\nYang Sin National Park</th>\n</tr></table>"
207
+ end
208
+ end
209
+
210
+ end
metadata CHANGED
@@ -1,82 +1,140 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: html_massage
3
- version: !ruby/object:Gem::Version
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
4
5
  prerelease:
5
- version: 0.0.2
6
6
  platform: ruby
7
- authors:
8
- - Harlan Knight Wood
7
+ authors:
8
+ - Harlan T Wood
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2011-06-18 00:00:00 Z
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
12
+ date: 2012-11-25 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
16
15
  name: nokogiri
17
- prerelease: false
18
- requirement: &id001 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
19
17
  none: false
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 1.4.4
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '1.4'
24
22
  type: :runtime
25
- version_requirements: *id001
26
- - !ruby/object:Gem::Dependency
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '1.4'
30
+ - !ruby/object:Gem::Dependency
27
31
  name: sanitize
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '2.0'
38
+ type: :runtime
28
39
  prerelease: false
29
- requirement: &id002 !ruby/object:Gem::Requirement
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '2.0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: thor
48
+ requirement: !ruby/object:Gem::Requirement
30
49
  none: false
31
- requirements:
32
- - - ">="
33
- - !ruby/object:Gem::Version
34
- version: 2.0.0
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
35
54
  type: :runtime
36
- version_requirements: *id002
37
- description: "Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text."
38
- email:
39
- - code@hkw7.org
40
- executables: []
41
-
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rest-client
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '1.6'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '1.6'
78
+ - !ruby/object:Gem::Dependency
79
+ name: rspec
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '2.5'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '2.5'
94
+ description: ! 'Massages HTML how you want to: sanitize tags, remove headers and footers,
95
+ convert to plain text.'
96
+ email:
97
+ - code@harlantwood.net
98
+ executables:
99
+ - html_massage
42
100
  extensions: []
43
-
44
101
  extra_rdoc_files: []
45
-
46
- files:
102
+ files:
47
103
  - .gitignore
48
104
  - Gemfile
105
+ - License-MIT
49
106
  - README.md
50
107
  - Rakefile
108
+ - bin/html_massage
109
+ - generate_readme.rb
51
110
  - html_massage.gemspec
52
111
  - lib/html_massage.rb
112
+ - lib/html_massage/cli.rb
53
113
  - lib/html_massage/version.rb
54
- homepage: https://github.com/onesunone/html_massage
114
+ - spec/html_massage_spec.rb
115
+ homepage: https://github.com/harlantwood/html_massage
55
116
  licenses: []
56
-
57
117
  post_install_message:
58
118
  rdoc_options: []
59
-
60
- require_paths:
119
+ require_paths:
61
120
  - lib
62
- required_ruby_version: !ruby/object:Gem::Requirement
121
+ required_ruby_version: !ruby/object:Gem::Requirement
63
122
  none: false
64
- requirements:
65
- - - ">="
66
- - !ruby/object:Gem::Version
67
- version: "0"
68
- required_rubygems_version: !ruby/object:Gem::Requirement
123
+ requirements:
124
+ - - ! '>='
125
+ - !ruby/object:Gem::Version
126
+ version: '0'
127
+ required_rubygems_version: !ruby/object:Gem::Requirement
69
128
  none: false
70
- requirements:
71
- - - ">="
72
- - !ruby/object:Gem::Version
73
- version: "0"
129
+ requirements:
130
+ - - ! '>='
131
+ - !ruby/object:Gem::Version
132
+ version: '0'
74
133
  requirements: []
75
-
76
- rubyforge_project: html_massage
77
- rubygems_version: 1.8.5
134
+ rubyforge_project:
135
+ rubygems_version: 1.8.24
78
136
  signing_key:
79
137
  specification_version: 3
80
138
  summary: Massages HTML how you want to.
81
- test_files: []
82
-
139
+ test_files:
140
+ - spec/html_massage_spec.rb