html_massage 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ .bundle
2
+ .idea
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so dependency downloads are not sent in clear text.
source "https://rubygems.org"

# Specify your gem's dependencies in html_massage.gemspec
gemspec
data/README.md ADDED
@@ -0,0 +1,18 @@
1
+ # html_massage
2
+
3
+ Give your HTML a massage, in just the ways it loves:
4
+ * Remove headers, footers, and navigation, keeping only the "content" part of the HTML
5
+ * Sanitize tags, removing javascript and styling
6
+ * Convert your HTML to nicely-formatted plain text
7
+
8
+ ## Usage
9
+
10
+ require 'rubygems'
11
+ require 'html_massage'
12
+ html = "<html><body><div id='header'>My Site</div><div>This is some great content!</div></body></html>"
13
+ html_massage = HtmlMassage.new( html, :ignored_selectors => [ '#header' ] )
14
+ # => #<HtmlMassager::HtmlMassage ... >
15
+ html_massage.to_html
16
+ # => "<div>This is some great content!</div>"
17
+ html_massage.to_text
18
+ # => "This is some great content!\n"
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require 'bundler/gem_tasks'
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "html_massage/version"
4
+
5
# Gem specification for html_massage: identity, runtime dependencies and
# the list of packaged files (taken from what git tracks).
Gem::Specification.new do |spec|
  # Identity
  spec.name        = 'html_massage'
  spec.version     = HtmlMassager::VERSION
  spec.authors     = ['Harlan Knight Wood']
  spec.email       = ['code@hkw7.org']
  spec.homepage    = 'https://github.com/onesunone/html_massage'
  spec.summary     = "Massages HTML how you want to."
  spec.description = "Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text."

  spec.rubyforge_project = 'html_massage'

  # Runtime dependencies
  spec.add_dependency 'nokogiri', '>= 1.4.4'
  spec.add_dependency 'sanitize', '>= 2.0.0'

  # Packaged files: everything tracked by git, with tests and executables
  # picked out by path.
  tracked_files     = `git ls-files`
  tracked_tests     = `git ls-files -- {test,spec,features}/*`
  tracked_binaries  = `git ls-files -- bin/*`

  spec.files         = tracked_files.split("\n")
  spec.test_files    = tracked_tests.split("\n")
  spec.executables   = tracked_binaries.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ['lib']
end
@@ -0,0 +1,3 @@
1
# Namespace of the html_massage gem.
module HtmlMassager
  # Release version of the gem.
  VERSION = '0.0.2'
end
@@ -0,0 +1,176 @@
1
+ require "cgi"
2
+ require "nokogiri"
3
+ require "sanitize"
4
+ require "html_massage/version"
5
+
6
module HtmlMassager
  # Cleans up an HTML document and exposes the result as HTML or plain text.
  #
  # On construction the input is reduced to the contents of its <body> (or of
  # the element with id "content" when one exists), elements matching the
  # ignored selectors are removed, the markup is run through Sanitize with a
  # whitelist, and - when a source URL is given - some relative links are
  # made absolute.
  class HtmlMassage
    # Whitelist handed to Sanitize: allowed elements, allowed attributes and
    # allowed URL protocols.
    #
    # consider including for deprecated/historical/or spam-suspect pages:
    # Gollum has a nice way to add this to your config optionally, see:
    # https://github.com/github/gollum/blob/master/lib/gollum/sanitization.rb
    #
    #   :add_attributes => {
    #     'a' => {'rel' => 'nofollow'}
    #   }
    SANITIZE_CONFIG = {
      :elements => [
        'a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
        'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir',
        'div', 'dl', 'dt', 'em', 'fieldset', 'form', 'h1',
        'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
        'img',
        'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu',
        'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub',
        'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
        'thead', 'tr', 'tt', 'u', 'ul', 'var',
      ],
      :attributes => {
        'a' => ['href'],
        'img' => ['src'],
        :all => ['abbr', 'accept', 'accept-charset',
                 'accesskey', 'action', 'align', 'alt', 'axis',
                 'border', 'cellpadding', 'cellspacing', 'char',
                 'charoff', 'class', 'charset', 'checked', 'cite',
                 'clear', 'cols', 'colspan', 'color',
                 'compact', 'coords', 'datetime', 'dir',
                 'disabled', 'enctype', 'for', 'frame',
                 'headers', 'height', 'hreflang',
                 'hspace', 'id', 'ismap', 'label', 'lang',
                 'longdesc', 'maxlength', 'media', 'method',
                 'multiple', 'name', 'nohref', 'noshade',
                 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
                 'rows', 'rowspan', 'rules', 'scope',
                 'selected', 'shape', 'size', 'span',
                 'start', 'summary', 'tabindex', 'target',
                 'title', 'type', 'usemap', 'valign', 'value',
                 'vspace', 'width']
      },
      :protocols => {
        'a' => {'href' => ['http', 'https', 'mailto', :relative]},
        'img' => {'src' => ['http', 'https', :relative]}
      }
    }.freeze

    # Splits a source URL into "scheme://host" (group 1) and an optional
    # directory path ending in "/" (group 2). Query strings and fragments are
    # excluded so slashes inside them cannot be mistaken for path separators.
    SOURCE_URL_PATTERN = %r{\A([a-z][a-z0-9+.\-]*://[^/?#]+)([^?#]*/)?}i

    # @param html [String] the raw HTML document
    # @param options [Hash, nil] optional settings:
    #   :source_url        - URL the HTML came from; enables link absolutifying
    #   :ignored_selectors - selector (or array of selectors) whose matching
    #                        elements are removed from the body
    def initialize( html, options = {} )
      options ||= {}
      @source_url = options[ :source_url ]
      @ignored_selectors = options[ :ignored_selectors ]
      @clean_html = massage_html( html )
    end

    # Runs the full clean-up pipeline and returns the resulting HTML string.
    def massage_html( html )
      html = content_only( html )
      html = sanitize_html( html )
      html = absolutify_links( html ) if @source_url
      html
    end

    # Returns the inner HTML of the "#content" element inside <body>, or of
    # the whole <body> when no such element exists. Elements matching the
    # ignored selectors are removed first.
    def content_only( content )
      doc = Nokogiri::HTML( content )
      body = doc / 'html' / 'body'

      # Array() accepts nil, a single selector string, or an array of them.
      Array( @ignored_selectors ).each do |ignored_selector|
        ( body / ignored_selector ).remove
      end

      main = body / '#content'
      main = body if main.empty?
      main.inner_html
    end

    # Removes script/noscript/style elements together with their contents,
    # then filters the remaining markup through Sanitize using
    # SANITIZE_CONFIG.
    def sanitize_html( html )
      html = html.dup

      %w[ script noscript style ].each do |tag|
        html.gsub!( %r{<#{tag}[^>]*>.*?</#{tag}>}mi, '' )
      end

      # Use Sanitize.fragment when the installed Sanitize provides it and
      # fall back to Sanitize.clean otherwise, so either API works.
      if Sanitize.respond_to?( :fragment )
        Sanitize.fragment( html, SANITIZE_CONFIG )
      else
        Sanitize.clean( html, SANITIZE_CONFIG )
      end
    end

    # Rewrites <a href> values that start with "/" or ".." into absolute URLs
    # based on the source URL. Other hrefs (already absolute, protocol
    # relative, or plain relative paths) are left unchanged. Returns the html
    # unchanged when the source URL cannot be parsed.
    def absolutify_links( html )
      match = @source_url.to_s.match( SOURCE_URL_PATTERN )
      return html unless match

      base_url = match[ 1 ]
      # A source URL without any path (e.g. "http://host") resolves against
      # the site root.
      resource_dir_url = match[ 2 ] ? match[ 0 ] : "#{base_url}/"

      dom = Nokogiri::HTML.fragment( html )
      ( dom / 'a' ).each do |link|
        href = link[ 'href' ]
        link[ 'href' ] = absolutify_href( href, base_url, resource_dir_url ) if href
      end
      dom.to_s
    end

    # Returns the cleaned HTML produced at construction time.
    def to_html
      @clean_html
    end

    # Converts the cleaned HTML into plain text: block-level tags become
    # newlines, list items become "* " bullets, <hr> becomes a dashed rule,
    # all other tags are dropped and whitespace is normalized. The result
    # always ends with exactly one newline.
    def to_text
      text = @clean_html.dup

      # normalize newlines
      text.gsub!(/\r\n/, "\n")
      text.gsub!(/\r/, "\n")

      # nbsp => ' '
      text.gsub!(/&nbsp;/, ' ')

      text.gsub!(/\s+/, ' ') # all whitespace, including newlines, becomes a single space

      # replace some tags with newlines
      text.gsub!(%r{<br(\s[^>]*)?/?>}i, "\n")
      text.gsub!(%r{<p(\s[^>]*)?/?>}i, "\n\n")
      text.gsub!(%r{</(h\d|p|div|ol|ul)[^>]*>}i, "\n\n")

      # replace some tags with meaningful text markup
      text.gsub!(/<hr[^>]*>/i, "\n\n-------------------------\n\n")
      text.gsub!(/<li[^>]*>/i, "\n* ")

      # remove some tags and their inner html; /m because the replacements
      # above may have put newlines inside the element
      text.gsub!(%r{<noscript\b.*?</noscript>}mi, '')

      # strip out all remaining tags
      text.gsub!(/<[^>]+>/, '')

      # Decode entities only after the tags are gone, so escaped text such as
      # "&lt;b&gt;" survives as literal "<b>" instead of being stripped as a tag.
      text = CGI.unescapeHTML( text )

      # Numeric entities can decode to a real non-breaking space; the
      # encoding guard avoids regexp/string encoding mismatches.
      text.gsub!(/\u00A0/, ' ') if text.encoding == Encoding::UTF_8

      # normalize whitespace
      text.gsub!(/ +/, ' ')
      text = strip_lines(text)
      text.gsub!( /\n{3,}/, "\n\n" )
      text.strip!

      "#{text}\n"
    end

    # Strips leading/trailing whitespace from every line and from the text
    # as a whole.
    def strip_lines( text )
      text.split( "\n" ).map { |line| line.strip }.join( "\n" ).strip
    end

    private

    # Resolves a single href against the source URL parts.
    def absolutify_href( href, base_url, resource_dir_url )
      case href
      when %r{\A//}
        # Protocol-relative URL: already names a host, and File.join would
        # collapse its leading slashes into a bogus local path.
        href
      when %r{\A/}
        File.join( base_url, href )
      when %r{\A\.\.}
        File.join( resource_dir_url, href )
      else
        href
      end
    end

  end
end
175
+
176
# Top-level include so HtmlMassage can be referenced without the HtmlMassager:: prefix.
+ include HtmlMassager
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html_massage
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.2
6
+ platform: ruby
7
+ authors:
8
+ - Harlan Knight Wood
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-06-18 00:00:00 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: nokogiri
17
+ prerelease: false
18
+ requirement: &id001 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.4.4
24
+ type: :runtime
25
+ version_requirements: *id001
26
+ - !ruby/object:Gem::Dependency
27
+ name: sanitize
28
+ prerelease: false
29
+ requirement: &id002 !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: 2.0.0
35
+ type: :runtime
36
+ version_requirements: *id002
37
+ description: "Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text."
38
+ email:
39
+ - code@hkw7.org
40
+ executables: []
41
+
42
+ extensions: []
43
+
44
+ extra_rdoc_files: []
45
+
46
+ files:
47
+ - .gitignore
48
+ - Gemfile
49
+ - README.md
50
+ - Rakefile
51
+ - html_massage.gemspec
52
+ - lib/html_massage.rb
53
+ - lib/html_massage/version.rb
54
+ homepage: https://github.com/onesunone/html_massage
55
+ licenses: []
56
+
57
+ post_install_message:
58
+ rdoc_options: []
59
+
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "0"
68
+ required_rubygems_version: !ruby/object:Gem::Requirement
69
+ none: false
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: "0"
74
+ requirements: []
75
+
76
+ rubyforge_project: html_massage
77
+ rubygems_version: 1.8.5
78
+ signing_key:
79
+ specification_version: 3
80
+ summary: Massages HTML how you want to.
81
+ test_files: []
82
+