html_massage 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ .bundle
2
+ .idea
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in html_massage.gemspec
gemspec
data/README.md ADDED
@@ -0,0 +1,18 @@
1
+ # html_massage
2
+
3
+ Give your HTML a massage, in just the ways it loves:
4
+ * Remove headers and footers and navigation, and strip to only the "content" part of the HTML
5
+ * Sanitize tags, removing javascript and styling
6
+ * Convert your HTML to nicely-formatted plain text
7
+
8
+ ## Usage
9
+
10
+ require 'rubygems'
11
+ require 'html_massage'
12
+ html = "<html><body><div id='header'>My Site</div><div>This is some great content!</div></body></html>"
13
+ html_massage = HtmlMassage.new( html, :ignored_selectors => [ '#header' ] )
14
+ # => #<HtmlMassager::HtmlMassage ... >
15
+ html_massage.to_html
16
+ # => "<div>This is some great content!</div>"
17
+ html_massage.to_text
18
+ # => "This is some great content!\n"
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require 'bundler/gem_tasks'
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "html_massage/version"
4
+
5
# Packaging metadata for the html_massage gem.
Gem::Specification.new do |s|
  s.name        = "html_massage"
  s.version     = HtmlMassager::VERSION
  s.authors     = ["Harlan Knight Wood"]
  s.email       = ["code@hkw7.org"]
  s.homepage    = "https://github.com/onesunone/html_massage"
  s.summary     = %{Massages HTML how you want to.}
  s.description = %{Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text.}

  # `rubyforge_project=` is a deprecated RubyGems setter; only call it when
  # the running RubyGems still defines it, so the spec keeps loading either way.
  s.rubyforge_project = "html_massage" if s.respond_to?(:rubyforge_project=)

  s.add_dependency('nokogiri', ">= 1.4.4")
  s.add_dependency('sanitize', ">= 2.0.0")

  # The packaged file lists are whatever git currently tracks.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
@@ -0,0 +1,3 @@
1
module HtmlMassager
  # Version string of the html_massage gem. Frozen so the shared constant
  # cannot be modified in place by code that reads it.
  VERSION = "0.0.2".freeze
end
@@ -0,0 +1,176 @@
1
+ require "cgi"
2
+ require "nokogiri"
3
+ require "sanitize"
4
+ require "html_massage/version"
5
+
6
module HtmlMassager
  # Massages an HTML document in three steps: keeps only the content part of
  # the body, sanitizes the markup, and (when a source URL is given) makes
  # links absolute. The result is available as HTML (#to_html) or as
  # plain text (#to_text).
  class HtmlMassage
    # Elements that are removed together with everything inside them before
    # the markup is handed to Sanitize.
    STRIPPED_ELEMENTS = %w[ script noscript style ].freeze

    # Whitelist passed to Sanitize: allowed elements, allowed attributes and
    # allowed URL protocols.
    #
    # NOTE(review): only a[href] and img[src] are protocol-checked; other
    # URL-bearing attributes allowed below (e.g. action, cite, longdesc) are
    # not -- confirm this is intended.
    SANITIZE_CONFIG = {
      :elements => [
        'a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
        'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir',
        'div', 'dl', 'dt', 'em', 'fieldset', 'form', 'h1',
        'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
        'img',
        'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu',
        'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub',
        'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
        'thead', 'tr', 'tt', 'u', 'ul', 'var',
      ],
      :attributes => {
        'a' => ['href'],
        'img' => ['src'],
        :all => ['abbr', 'accept', 'accept-charset',
                 'accesskey', 'action', 'align', 'alt', 'axis',
                 'border', 'cellpadding', 'cellspacing', 'char',
                 'charoff', 'class', 'charset', 'checked', 'cite',
                 'clear', 'cols', 'colspan', 'color',
                 'compact', 'coords', 'datetime', 'dir',
                 'disabled', 'enctype', 'for', 'frame',
                 'headers', 'height', 'hreflang',
                 'hspace', 'id', 'ismap', 'label', 'lang',
                 'longdesc', 'maxlength', 'media', 'method',
                 'multiple', 'name', 'nohref', 'noshade',
                 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
                 'rows', 'rowspan', 'rules', 'scope',
                 'selected', 'shape', 'size', 'span',
                 'start', 'summary', 'tabindex', 'target',
                 'title', 'type', 'usemap', 'valign', 'value',
                 'vspace', 'width']
      },
      :protocols => {
        'a' => {'href' => ['http', 'https', 'mailto', :relative]},
        'img' => {'src' => ['http', 'https', :relative]}
      },

      # consider including for deprecated/historical/or spam-suspect pages:
      # Gollum has a nice way to add this to your config optionally, see:
      # https://github.com/github/gollum/blob/master/lib/gollum/sanitization.rb
      #
      # :add_attributes => {
      #   'a' => {'rel' => 'nofollow'}
      # }
    }.freeze

    # Splits a source URL into "scheme://host" (capture 1) and its path
    # (capture 2); query string and fragment are left out of the path.
    SOURCE_URL_PATTERN = %r{\A([a-z][a-z0-9+.-]*://[^/?#]+)([^?#]*)}i

    # html    - the HTML document to massage.
    # options - optional Hash:
    #           :source_url        - URL the HTML came from; when present,
    #                                links are made absolute against it.
    #           :ignored_selectors - a selector or list of selectors whose
    #                                matches are removed from the body.
    def initialize(html, options = {})
      options ||= {}
      @source_url = options[:source_url]
      @ignored_selectors = options[:ignored_selectors]
      @clean_html = massage_html(html)
    end

    # Runs the whole pipeline and returns the massaged HTML string.
    def massage_html(html)
      html = content_only(html)
      html = sanitize_html(html)
      html = absolutify_links(html) if @source_url
      html
    end

    # Returns the inner HTML of the '#content' element inside <body>, or of
    # the whole <body> when there is no such element. Elements matching the
    # ignored selectors are removed first.
    def content_only(content)
      doc = Nokogiri::HTML(content)
      body = doc / 'html' / 'body'

      # Array() accepts nil, a single selector String, or a list of selectors.
      Array(@ignored_selectors).each do |ignored_selector|
        (body / ignored_selector).remove
      end

      main = body / '#content'
      main = body if main.empty?
      main.inner_html
    end

    # Returns a sanitized copy of html: script/noscript/style elements are
    # dropped with their content, then everything outside SANITIZE_CONFIG
    # is removed by Sanitize.
    def sanitize_html(html)
      html = html.dup

      STRIPPED_ELEMENTS.each do |tag|
        html.gsub!(%r{<#{tag}[^>]*>.*?</#{tag}>}mi, '')
      end

      # Newer Sanitize releases name this API `fragment`; `clean` is the
      # older name, so fall back to it when `fragment` is not available.
      if Sanitize.respond_to?(:fragment)
        Sanitize.fragment(html, SANITIZE_CONFIG)
      else
        Sanitize.clean(html, SANITIZE_CONFIG)
      end
    end

    # Rewrites the href of every <a> that starts with "/" or ".." into an
    # absolute URL based on the source URL. Returns html untouched when the
    # source URL cannot be parsed.
    def absolutify_links(html)
      base_url, resource_dir_url = source_url_parts
      return html unless base_url

      dom = Nokogiri::HTML.fragment(html)
      (dom / 'a').each do |link|
        href = link['href']
        link['href'] = absolutify_href(href, base_url, resource_dir_url) if href
      end
      dom.to_s
    end

    # The massaged HTML.
    def to_html
      @clean_html
    end

    # Renders the massaged HTML as plain text: block-level tags become line
    # breaks, list items become "* " bullets, <hr> becomes a dashed rule and
    # all other tags are dropped. The result always ends with one newline.
    def to_text
      # nbsp => ' ' (non-bang gsub also gives us a private copy to mutate)
      text = @clean_html.gsub(/&nbsp;/, ' ')

      # all whitespace, including newlines, becomes a single space
      text.gsub!(/\s+/, ' ')

      # remove some tags and their inner html; done before any tag is turned
      # into a newline so the whole element is still intact
      text.gsub!(%r{<noscript\b.*?</noscript>}mi, '')

      # replace some tags with newlines
      text.gsub!(%r{<br(\s[^>]*)?/?>}i, "\n")
      text.gsub!(%r{<p(\s[^>]*)?/?>}i, "\n\n")
      text.gsub!(%r{</(h\d|p|div|ol|ul)[^>]*>}i, "\n\n")

      # replace some tags with meaningful text markup
      text.gsub!(/<hr[^>]*>/i, "\n\n-------------------------\n\n")
      text.gsub!(/<li[^>]*>/i, "\n* ")

      # strip out all remaining tags
      text.gsub!(/<[^>]+>/, '')

      # Decode entities only after the tags are gone: decoding first would
      # turn escaped text such as "&lt;b&gt;" into "<b>" and the tag
      # stripping above would then delete it as if it were markup.
      text = CGI.unescapeHTML(text)

      # literal non-breaking spaces => ' '; the regexp is UTF-8, so it is
      # only applied to UTF-8 text to avoid an encoding mismatch
      text.gsub!(/\u00A0/, ' ') if text.encoding == Encoding::UTF_8

      # normalize whitespace
      text.gsub!(/ +/, ' ')
      text = strip_lines(text)
      text.gsub!(/\n{3,}/, "\n\n")
      text.strip!

      "#{text}\n"
    end

    # Strips leading/trailing whitespace from every line of text and from
    # the text as a whole.
    def strip_lines(text)
      text.split("\n").map(&:strip).join("\n").strip
    end

    private

    # Returns [base_url, resource_dir_url] for @source_url, e.g.
    # ["http://host", "http://host/docs/"] for "http://host/docs/page.html",
    # or nil when @source_url does not look like "scheme://host...".
    def source_url_parts
      match = SOURCE_URL_PATTERN.match(@source_url.to_s)
      return nil unless match

      base_url = match[1]
      # Everything up to the last "/" of the path; "/" for root-level pages.
      directory = match[2][%r{\A.*/}m] || '/'
      [base_url, base_url + directory]
    end

    # Returns href made absolute: root-relative hrefs are joined to base_url,
    # "../" hrefs to resource_dir_url, anything else is returned unchanged.
    def absolutify_href(href, base_url, resource_dir_url)
      case href
      when %r{\A//}   then href # protocol-relative: already names its own host
      when %r{\A/}    then File.join(base_url, href)
      when %r{\A\.\.} then File.join(resource_dir_url, href)
      else href
      end
    end
  end
end
175
+
176
# Mixes HtmlMassager into the top level so its constants can be referenced
# without the namespace prefix, e.g. `HtmlMassage.new(...)`.
+ include HtmlMassager
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html_massage
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.2
6
+ platform: ruby
7
+ authors:
8
+ - Harlan Knight Wood
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-06-18 00:00:00 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: nokogiri
17
+ prerelease: false
18
+ requirement: &id001 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.4.4
24
+ type: :runtime
25
+ version_requirements: *id001
26
+ - !ruby/object:Gem::Dependency
27
+ name: sanitize
28
+ prerelease: false
29
+ requirement: &id002 !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: 2.0.0
35
+ type: :runtime
36
+ version_requirements: *id002
37
+ description: "Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text."
38
+ email:
39
+ - code@hkw7.org
40
+ executables: []
41
+
42
+ extensions: []
43
+
44
+ extra_rdoc_files: []
45
+
46
+ files:
47
+ - .gitignore
48
+ - Gemfile
49
+ - README.md
50
+ - Rakefile
51
+ - html_massage.gemspec
52
+ - lib/html_massage.rb
53
+ - lib/html_massage/version.rb
54
+ homepage: https://github.com/onesunone/html_massage
55
+ licenses: []
56
+
57
+ post_install_message:
58
+ rdoc_options: []
59
+
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "0"
68
+ required_rubygems_version: !ruby/object:Gem::Requirement
69
+ none: false
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: "0"
74
+ requirements: []
75
+
76
+ rubyforge_project: html_massage
77
+ rubygems_version: 1.8.5
78
+ signing_key:
79
+ specification_version: 3
80
+ summary: Massages HTML how you want to.
81
+ test_files: []
82
+