html_massage 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.md +18 -0
- data/Rakefile +1 -0
- data/html_massage.gemspec +23 -0
- data/lib/html_massage/version.rb +3 -0
- data/lib/html_massage.rb +176 -0
- metadata +82 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# html_massage
|
2
|
+
|
3
|
+
Give your HTML a massage, in just the ways it loves:
|
4
|
+
* Remove headers and footers and navigation, and strip to only the "content" part of the HTML
|
5
|
+
* Sanitize tags, removing javascript and styling
|
6
|
+
* Convert your HTML to nicely-formatted plain text
|
7
|
+
|
8
|
+
## Usage
|
9
|
+
|
10
|
+
require 'rubygems'
|
11
|
+
require 'html_massage'
|
12
|
+
html = "<html><body><div id='header'>My Site</div><div>This is some great content!</div></body></html>"
|
13
|
+
html_massage = HtmlMassage.new( html, :ignored_selectors => [ '#header' ] )
|
14
|
+
# => #<HtmlMassager::HtmlMassage ... >
|
15
|
+
html_massage.to_html
|
16
|
+
# => "<div>This is some great content!</div>"
|
17
|
+
html_massage.to_text
|
18
|
+
# => "This is some great content!\n"
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'bundler/gem_tasks'
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "html_massage/version"
|
4
|
+
|
5
|
+
# Gem specification for html_massage: identity, runtime dependencies and the
# list of packaged files (taken from the git index).
Gem::Specification.new do |s|
  s.name        = "html_massage"
  s.version     = HtmlMassager::VERSION
  s.authors     = ["Harlan Knight Wood"]
  s.email       = ["code@hkw7.org"]
  s.homepage    = "https://github.com/onesunone/html_massage"
  s.summary     = %{Massages HTML how you want to.}
  s.description = %{Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text.}

  # NOTE(review): rubyforge_project is deprecated in newer RubyGems releases;
  # only set it when the running RubyGems still provides the writer, so the
  # gemspec keeps loading if the attribute is dropped.
  s.rubyforge_project = "html_massage" if s.respond_to?( :rubyforge_project= )

  s.add_dependency('nokogiri', ">= 1.4.4")
  s.add_dependency('sanitize', ">= 2.0.0")

  # File lists come from git, so this must be evaluated inside a git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
|
data/lib/html_massage.rb
ADDED
@@ -0,0 +1,176 @@
|
|
1
|
+
require "cgi"
|
2
|
+
require "nokogiri"
|
3
|
+
require "sanitize"
|
4
|
+
require "html_massage/version"
|
5
|
+
|
6
|
+
module HtmlMassager
  # Reduces an HTML document to its sanitized "content" markup and can render
  # that markup as plain text.
  #
  #   massage = HtmlMassage.new( html, :ignored_selectors => [ '#header' ] )
  #   massage.to_html  # sanitized content markup
  #   massage.to_text  # plain-text rendering of that markup
  class HtmlMassage
    # html    - String of HTML to process.
    # options - Hash (optional):
    #           :source_url        - URL the HTML came from; when given,
    #                                root-relative and "../" links are made
    #                                absolute against it.
    #           :ignored_selectors - CSS selector (or Array of selectors) whose
    #                                matching nodes are removed from the body.
    def initialize( html, options = {} )
      options ||= {}
      @source_url = options[ :source_url ]
      @ignored_selectors = options[ :ignored_selectors ]
      @clean_html = massage_html( html )
    end

    # Runs the full pipeline: extract content, sanitize, then absolutify links
    # (only when a source URL was supplied). Returns the resulting HTML String.
    def massage_html( html )
      html = content_only( html )
      html = sanitize_html( html )
      html = absolutify_links( html ) if @source_url
      html
    end

    # Returns the inner HTML of the '#content' element inside <body>, or of
    # <body> itself when no such element exists, after removing every node
    # matching the ignored selectors.
    def content_only( content )
      doc = Nokogiri::HTML( content )
      body = doc / 'html' / 'body'

      # Array() accepts nil, a single selector String, or an Array of selectors.
      Array( @ignored_selectors ).each do |ignored_selector|
        ( body / ignored_selector ).remove
      end

      content = body / '#content'
      content = body if content.empty?
      content.inner_html
    end

    # Removes script/noscript/style elements (including their contents) and
    # then whitelists elements, attributes and URL protocols via Sanitize.
    # Returns the sanitized HTML String; the argument is not modified.
    def sanitize_html(html)
      html = html.dup

      # Drop these tags together with their inner text before sanitizing, so
      # their contents do not survive as plain text.
      %w[ script noscript style ].each do |tag|
        html.gsub!( %r{<#{tag}[^>]*>.*?</#{tag}>}mi, '' )
      end

      config = {
        :elements => [
          'a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
          'blockquote', 'br', 'button', 'caption', 'center', 'cite',
          'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir',
          'div', 'dl', 'dt', 'em', 'fieldset', 'form', 'h1',
          'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
          'img',
          'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu',
          'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
          'select', 'small', 'span', 'strike', 'strong', 'sub',
          'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
          'thead', 'tr', 'tt', 'u', 'ul', 'var',
        ],
        :attributes => {
          'a' => ['href'],
          'img' => ['src'],
          :all => ['abbr', 'accept', 'accept-charset',
                   'accesskey', 'action', 'align', 'alt', 'axis',
                   'border', 'cellpadding', 'cellspacing', 'char',
                   'charoff', 'class', 'charset', 'checked', 'cite',
                   'clear', 'cols', 'colspan', 'color',
                   'compact', 'coords', 'datetime', 'dir',
                   'disabled', 'enctype', 'for', 'frame',
                   'headers', 'height', 'hreflang',
                   'hspace', 'id', 'ismap', 'label', 'lang',
                   'longdesc', 'maxlength', 'media', 'method',
                   'multiple', 'name', 'nohref', 'noshade',
                   'nowrap', 'prompt', 'readonly', 'rel', 'rev',
                   'rows', 'rowspan', 'rules', 'scope',
                   'selected', 'shape', 'size', 'span',
                   'start', 'summary', 'tabindex', 'target',
                   'title', 'type', 'usemap', 'valign', 'value',
                   'vspace', 'width']
        },
        :protocols => {
          'a' => {'href' => ['http', 'https', 'mailto', :relative]},
          'img' => {'src' => ['http', 'https', :relative]}
        },

        # consider including for deprecated/historical/or spam-suspect pages:
        # Gollum has a nice way to add this to your config optionally, see:
        # https://github.com/github/gollum/blob/master/lib/gollum/sanitization.rb
        #
        # :add_attributes => {
        #   'a' => {'rel' => 'nofollow'}
        # }
      }

      # NOTE(review): newer Sanitize releases expose fragment cleaning as
      # Sanitize.fragment; fall back to Sanitize.clean for older releases.
      if Sanitize.respond_to?( :fragment )
        Sanitize.fragment( html, config )
      else
        Sanitize.clean( html, config )
      end
    end

    # Rewrites <a href> values that are root-relative ("/x") or start with
    # ".." into absolute URLs based on the source URL. Other hrefs are left
    # untouched. Returns html unchanged when the source URL cannot be parsed.
    def absolutify_links( html )
      base_url, resource_dir_url = source_url_parts
      return html unless base_url

      dom = Nokogiri::HTML.fragment( html )
      ( dom / 'a' ).each do |link|
        href = link[ 'href' ]
        link[ 'href' ] = absolutify_href( href, base_url, resource_dir_url ) if href
      end
      dom.to_s
    end

    # Returns the massaged HTML String.
    def to_html
      @clean_html
    end

    # Renders the massaged HTML as plain text: block-level tags become blank
    # lines, <br> a newline, <li> a "* " bullet, <hr> a dashed rule; all other
    # tags are dropped. The result always ends with exactly one newline.
    def to_text
      text = @clean_html.dup

      # normalize newlines, then collapse all whitespace (including newlines)
      # to single spaces; line structure is rebuilt from the tags below
      text.gsub!( /\r\n?/, "\n" )
      text.gsub!( /\s+/, ' ' )

      # replace some tags with newlines
      text.gsub!( %r{<br(\s[^>]*)?/?>}i, "\n" )
      text.gsub!( %r{<p(\s[^>]*)?/?>}i, "\n\n" )
      text.gsub!( %r{</(h\d|p|div|ol|ul)[^>]*>}i, "\n\n" )

      # replace some tags with meaningful text markup
      text.gsub!( /<hr[^>]*>/i, "\n\n-------------------------\n\n" )
      text.gsub!( /<li[^>]*>/i, "\n* " )

      # remove some tags and their inner html
      text.gsub!( %r{<noscript\b.*?</noscript>}i, '' )

      # strip out all remaining tags
      text.gsub!( /<[^>]+>/, '' )

      # Decode entities only AFTER the markup is gone; decoding first would
      # turn escaped text such as "&lt;b&gt;" into "<b>", which the tag
      # stripping above would then delete as if it were real markup.
      text.gsub!( /&nbsp;/, ' ' )
      text = CGI.unescapeHTML( text )
      # Literal no-break spaces; guarded by encoding so a non-UTF-8 string
      # cannot raise an encoding-compatibility error.
      text.gsub!( "\u00A0", ' ' ) if text.encoding == Encoding::UTF_8

      # normalize whitespace (newlines are kept; they carry the layout)
      text.gsub!( /[^\S\n]+/, ' ' )
      text = strip_lines( text )
      text.gsub!( /\n{3,}/, "\n\n" )
      text.strip!

      "#{text}\n"
    end

    # Strips leading/trailing whitespace from every line of text and from the
    # text as a whole. Returns a new String.
    def strip_lines( text )
      text.split( "\n" ).map { |line| line.strip }.join( "\n" ).strip
    end

    private

    # Splits @source_url into [ base_url, resource_dir_url ]:
    #   base_url         - scheme and host, e.g. "http://example.com"
    #   resource_dir_url - base_url plus the directory part of the path
    #                      (query string and fragment excluded)
    # Returns nil when @source_url is not an absolute URL.
    def source_url_parts
      match = @source_url.to_s.match( %r{\A([a-z][a-z0-9+.\-]*://[^/?#]+)(/(?:[^?#]*/)?)?}i )
      match && [ match[ 1 ], match[ 0 ] ]
    end

    # Resolves a single href against the source URL parts.
    def absolutify_href( href, base_url, resource_dir_url )
      case href
      when %r{\A//}
        href # protocol-relative URL: already names its own host
      when %r{\A/}
        File.join( base_url, href )
      when %r{\A\.\.}
        File.join( resource_dir_url, href )
      else
        href
      end
    end

  end
end

# Expose HtmlMassage at the top level, as the README usage expects.
include HtmlMassager
|
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: html_massage
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.2
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Harlan Knight Wood
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-06-18 00:00:00 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: nokogiri
|
17
|
+
prerelease: false
|
18
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
19
|
+
none: false
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.4.4
|
24
|
+
type: :runtime
|
25
|
+
version_requirements: *id001
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: sanitize
|
28
|
+
prerelease: false
|
29
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
30
|
+
none: false
|
31
|
+
requirements:
|
32
|
+
- - ">="
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: 2.0.0
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id002
|
37
|
+
description: "Massages HTML how you want to: sanitize tags, remove headers and footers, convert to plain text."
|
38
|
+
email:
|
39
|
+
- code@hkw7.org
|
40
|
+
executables: []
|
41
|
+
|
42
|
+
extensions: []
|
43
|
+
|
44
|
+
extra_rdoc_files: []
|
45
|
+
|
46
|
+
files:
|
47
|
+
- .gitignore
|
48
|
+
- Gemfile
|
49
|
+
- README.md
|
50
|
+
- Rakefile
|
51
|
+
- html_massage.gemspec
|
52
|
+
- lib/html_massage.rb
|
53
|
+
- lib/html_massage/version.rb
|
54
|
+
homepage: https://github.com/onesunone/html_massage
|
55
|
+
licenses: []
|
56
|
+
|
57
|
+
post_install_message:
|
58
|
+
rdoc_options: []
|
59
|
+
|
60
|
+
require_paths:
|
61
|
+
- lib
|
62
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: "0"
|
68
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: "0"
|
74
|
+
requirements: []
|
75
|
+
|
76
|
+
rubyforge_project: html_massage
|
77
|
+
rubygems_version: 1.8.5
|
78
|
+
signing_key:
|
79
|
+
specification_version: 3
|
80
|
+
summary: Massages HTML how you want to.
|
81
|
+
test_files: []
|
82
|
+
|