pismo 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +19 -28
- data/NOTICE +4 -0
- data/README.markdown +37 -40
- data/Rakefile +3 -2
- data/VERSION +1 -1
- data/bin/pismo +15 -7
- data/lib/pismo/document.rb +2 -2
- data/lib/pismo/internal_attributes.rb +23 -16
- data/lib/pismo/reader.rb +390 -0
- data/lib/pismo.rb +3 -2
- data/pismo.gemspec +23 -15
- data/test/corpus/bbcnews2.html +1575 -0
- data/test/corpus/gmane.html +138 -0
- data/test/corpus/metadata_expected.yaml +20 -5
- data/test/corpus/queness.html +919 -0
- data/test/corpus/reader_expected.yaml +45 -0
- data/test/corpus/tweet.html +360 -0
- data/test/corpus/zefrank.html +535 -0
- data/test/test_corpus.rb +9 -1
- metadata +89 -34
- data/lib/pismo/readability.rb +0 -342
- data/test/test_readability.rb +0 -152
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pismo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 7
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 6
|
9
|
+
- 0
|
10
|
+
version: 0.6.0
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Peter Cooper
|
@@ -9,69 +15,107 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date: 2010-06-
|
18
|
+
date: 2010-06-20 00:00:00 +01:00
|
13
19
|
default_executable: pismo
|
14
20
|
dependencies:
|
15
21
|
- !ruby/object:Gem::Dependency
|
16
22
|
name: shoulda
|
17
|
-
|
18
|
-
|
19
|
-
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
20
26
|
requirements:
|
21
27
|
- - ">="
|
22
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
23
32
|
version: "0"
|
24
|
-
|
33
|
+
type: :development
|
34
|
+
version_requirements: *id001
|
25
35
|
- !ruby/object:Gem::Dependency
|
26
|
-
name:
|
27
|
-
|
28
|
-
|
29
|
-
|
36
|
+
name: awesome_print
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
30
40
|
requirements:
|
31
41
|
- - ">="
|
32
42
|
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
44
|
+
segments:
|
45
|
+
- 0
|
33
46
|
version: "0"
|
34
|
-
|
47
|
+
type: :development
|
48
|
+
version_requirements: *id002
|
35
49
|
- !ruby/object:Gem::Dependency
|
36
|
-
name:
|
37
|
-
|
38
|
-
|
39
|
-
|
50
|
+
name: jeweler
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
40
54
|
requirements:
|
41
55
|
- - ">="
|
42
56
|
- !ruby/object:Gem::Version
|
57
|
+
hash: 3
|
58
|
+
segments:
|
59
|
+
- 0
|
43
60
|
version: "0"
|
44
|
-
|
61
|
+
type: :runtime
|
62
|
+
version_requirements: *id003
|
45
63
|
- !ruby/object:Gem::Dependency
|
46
|
-
name:
|
64
|
+
name: nokogiri
|
65
|
+
prerelease: false
|
66
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
hash: 3
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
version: "0"
|
47
75
|
type: :runtime
|
48
|
-
|
49
|
-
|
76
|
+
version_requirements: *id004
|
77
|
+
- !ruby/object:Gem::Dependency
|
78
|
+
name: sanitize
|
79
|
+
prerelease: false
|
80
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
50
82
|
requirements:
|
51
83
|
- - ">="
|
52
84
|
- !ruby/object:Gem::Version
|
85
|
+
hash: 3
|
86
|
+
segments:
|
87
|
+
- 0
|
53
88
|
version: "0"
|
54
|
-
|
89
|
+
type: :runtime
|
90
|
+
version_requirements: *id005
|
55
91
|
- !ruby/object:Gem::Dependency
|
56
92
|
name: fast-stemmer
|
57
|
-
|
58
|
-
|
59
|
-
|
93
|
+
prerelease: false
|
94
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
95
|
+
none: false
|
60
96
|
requirements:
|
61
97
|
- - ">="
|
62
98
|
- !ruby/object:Gem::Version
|
99
|
+
hash: 3
|
100
|
+
segments:
|
101
|
+
- 0
|
63
102
|
version: "0"
|
64
|
-
|
103
|
+
type: :runtime
|
104
|
+
version_requirements: *id006
|
65
105
|
- !ruby/object:Gem::Dependency
|
66
106
|
name: chronic
|
67
|
-
|
68
|
-
|
69
|
-
|
107
|
+
prerelease: false
|
108
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
109
|
+
none: false
|
70
110
|
requirements:
|
71
111
|
- - ">="
|
72
112
|
- !ruby/object:Gem::Version
|
113
|
+
hash: 3
|
114
|
+
segments:
|
115
|
+
- 0
|
73
116
|
version: "0"
|
74
|
-
|
117
|
+
type: :runtime
|
118
|
+
version_requirements: *id007
|
75
119
|
description: Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.
|
76
120
|
email: git@peterc.org
|
77
121
|
executables:
|
@@ -85,6 +129,7 @@ files:
|
|
85
129
|
- .document
|
86
130
|
- .gitignore
|
87
131
|
- LICENSE
|
132
|
+
- NOTICE
|
88
133
|
- README.markdown
|
89
134
|
- Rakefile
|
90
135
|
- VERSION
|
@@ -93,25 +138,30 @@ files:
|
|
93
138
|
- lib/pismo/document.rb
|
94
139
|
- lib/pismo/external_attributes.rb
|
95
140
|
- lib/pismo/internal_attributes.rb
|
96
|
-
- lib/pismo/readability.rb
|
141
|
+
- lib/pismo/reader.rb
|
97
142
|
- lib/pismo/stopwords.txt
|
98
143
|
- pismo.gemspec
|
99
144
|
- test/corpus/bbcnews.html
|
145
|
+
- test/corpus/bbcnews2.html
|
100
146
|
- test/corpus/briancray.html
|
101
147
|
- test/corpus/cant_read.html
|
102
148
|
- test/corpus/factor.html
|
149
|
+
- test/corpus/gmane.html
|
103
150
|
- test/corpus/huffington.html
|
104
151
|
- test/corpus/metadata_expected.yaml
|
105
152
|
- test/corpus/metadata_expected.yaml.old
|
153
|
+
- test/corpus/queness.html
|
154
|
+
- test/corpus/reader_expected.yaml
|
106
155
|
- test/corpus/rubyinside.html
|
107
156
|
- test/corpus/rww.html
|
108
157
|
- test/corpus/spolsky.html
|
109
158
|
- test/corpus/techcrunch.html
|
159
|
+
- test/corpus/tweet.html
|
110
160
|
- test/corpus/youtube.html
|
161
|
+
- test/corpus/zefrank.html
|
111
162
|
- test/helper.rb
|
112
163
|
- test/test_corpus.rb
|
113
164
|
- test/test_pismo_document.rb
|
114
|
-
- test/test_readability.rb
|
115
165
|
has_rdoc: true
|
116
166
|
homepage: http://github.com/peterc/pismo
|
117
167
|
licenses: []
|
@@ -122,21 +172,27 @@ rdoc_options:
|
|
122
172
|
require_paths:
|
123
173
|
- lib
|
124
174
|
required_ruby_version: !ruby/object:Gem::Requirement
|
175
|
+
none: false
|
125
176
|
requirements:
|
126
177
|
- - ">="
|
127
178
|
- !ruby/object:Gem::Version
|
179
|
+
hash: 3
|
180
|
+
segments:
|
181
|
+
- 0
|
128
182
|
version: "0"
|
129
|
-
version:
|
130
183
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
184
|
+
none: false
|
131
185
|
requirements:
|
132
186
|
- - ">="
|
133
187
|
- !ruby/object:Gem::Version
|
188
|
+
hash: 3
|
189
|
+
segments:
|
190
|
+
- 0
|
134
191
|
version: "0"
|
135
|
-
version:
|
136
192
|
requirements: []
|
137
193
|
|
138
194
|
rubyforge_project:
|
139
|
-
rubygems_version: 1.3.
|
195
|
+
rubygems_version: 1.3.7
|
140
196
|
signing_key:
|
141
197
|
specification_version: 3
|
142
198
|
summary: Extracts or retrieves content-related metadata from HTML pages
|
@@ -144,4 +200,3 @@ test_files:
|
|
144
200
|
- test/helper.rb
|
145
201
|
- test/test_corpus.rb
|
146
202
|
- test/test_pismo_document.rb
|
147
|
-
- test/test_readability.rb
|
data/lib/pismo/readability.rb
DELETED
@@ -1,342 +0,0 @@
|
|
1
|
-
# This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
|
2
|
-
#
|
3
|
-
# This is a Ruby port of arc90's readability project
|
4
|
-
# http://lab.arc90.com/experiments/readability/
|
5
|
-
# Given a html document, it pulls out the main body text and cleans it up.
|
6
|
-
# Ruby port by starrhorne and iterationlabs
|
7
|
-
#
|
8
|
-
# Original JavaScript version:
|
9
|
-
# http://lab.arc90.com/experiments/readability/js/readability.js
|
10
|
-
# * Copyright (c) 2009 Arc90 Inc
|
11
|
-
# * Readability is licensed under the Apache License, Version 2.0.
|
12
|
-
#
|
13
|
-
# Minor edits and tweaks by Peter Cooper
|
14
|
-
|
15
|
-
require 'nokogiri'
|
16
|
-
|
17
|
-
IS_RUBY19 = "a".respond_to?(:encoding)
|
18
|
-
|
19
|
-
module Readability
|
20
|
-
class Document
|
21
|
-
TEXT_LENGTH_THRESHOLD = 25
|
22
|
-
RETRY_LENGTH = 250
|
23
|
-
|
24
|
-
attr_accessor :options, :html
|
25
|
-
|
26
|
-
def initialize(input, options = {})
|
27
|
-
@input = input
|
28
|
-
@options = options
|
29
|
-
make_html
|
30
|
-
end
|
31
|
-
|
32
|
-
def make_html
|
33
|
-
@html = Nokogiri::HTML(@input) #, nil, 'UTF-8')
|
34
|
-
end
|
35
|
-
|
36
|
-
REGEXES = {
|
37
|
-
:unlikelyCandidatesRe => /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
|
38
|
-
:okMaybeItsACandidateRe => /and|article|body|column|main/i,
|
39
|
-
:positiveRe => /article|body|content|entry|hentry|page|pagination|post|story|text/i,
|
40
|
-
:negativeRe => /combx|comment|contact|foot|box_wrap|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags/i,
|
41
|
-
:divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
42
|
-
:replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
|
43
|
-
:replaceFontsRe => /<(\/?)font[^>]*>/i,
|
44
|
-
:trimRe => /^\s+|\s+$/,
|
45
|
-
:normalizeRe => /\s{2,}/,
|
46
|
-
:killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
|
47
|
-
:videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
|
48
|
-
}
|
49
|
-
|
50
|
-
def content(remove_unlikely_candidates = true)
|
51
|
-
@html.css("script, style").each { |i| i.remove }
|
52
|
-
|
53
|
-
remove_unlikely_candidates! if remove_unlikely_candidates
|
54
|
-
transform_misused_divs_into_paragraphs!
|
55
|
-
candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
|
56
|
-
best_candidate = select_best_candidate(candidates)
|
57
|
-
article = get_article(candidates, best_candidate)
|
58
|
-
cleaned_article = sanitize(article, candidates, options)
|
59
|
-
cleaned_article.gsub!(/^\s+\n/, "\n")
|
60
|
-
cleaned_article.gsub!(/[\ \t]+/, ' ')
|
61
|
-
cleaned_article.gsub!(/^\s+/, '')
|
62
|
-
cleaned_article.gsub!(/\<\!\-\-.*?\-\-\>/m, '')
|
63
|
-
if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
|
64
|
-
make_html
|
65
|
-
content(false)
|
66
|
-
else
|
67
|
-
cleaned_article
|
68
|
-
end
|
69
|
-
end
|
70
|
-
|
71
|
-
def get_article(candidates, best_candidate)
|
72
|
-
# Now that we have the top candidate, look through its siblings for content that might also be related.
|
73
|
-
# Things like preambles, content split by ads that we removed, etc.
|
74
|
-
|
75
|
-
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
|
76
|
-
output = Nokogiri::XML::Node.new('div', @html)
|
77
|
-
|
78
|
-
return output unless best_candidate[:elem]
|
79
|
-
|
80
|
-
best_candidate[:elem].parent.children.each do |sibling|
|
81
|
-
append = false
|
82
|
-
append = true if sibling == best_candidate[:elem]
|
83
|
-
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
|
84
|
-
|
85
|
-
if sibling.name.downcase == "p"
|
86
|
-
link_density = get_link_density(sibling)
|
87
|
-
node_content = sibling.text
|
88
|
-
node_length = node_content.length
|
89
|
-
|
90
|
-
if node_length > 80 && link_density < 0.25
|
91
|
-
append = true
|
92
|
-
elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
|
93
|
-
append = true
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
if append
|
98
|
-
sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
|
99
|
-
output << sibling
|
100
|
-
end
|
101
|
-
end
|
102
|
-
|
103
|
-
output
|
104
|
-
end
|
105
|
-
|
106
|
-
def select_best_candidate(candidates)
|
107
|
-
sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
|
108
|
-
|
109
|
-
debug("Top 5 canidates:")
|
110
|
-
sorted_candidates[0...5].each do |candidate|
|
111
|
-
debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
|
112
|
-
end
|
113
|
-
|
114
|
-
best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
|
115
|
-
#debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")
|
116
|
-
|
117
|
-
best_candidate
|
118
|
-
end
|
119
|
-
|
120
|
-
def get_link_density(elem)
|
121
|
-
link_length = elem.css("a").map {|i| i.text}.join("").length
|
122
|
-
text_length = elem.text.length
|
123
|
-
link_length / text_length.to_f
|
124
|
-
end
|
125
|
-
|
126
|
-
def score_paragraphs(min_text_length)
|
127
|
-
candidates = {}
|
128
|
-
@html.css("p,td").each do |elem|
|
129
|
-
parent_node = elem.parent
|
130
|
-
grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
|
131
|
-
inner_text = elem.text
|
132
|
-
|
133
|
-
# If this paragraph is less than 25 characters, don't even count it.
|
134
|
-
next if inner_text.length < min_text_length
|
135
|
-
|
136
|
-
candidates[parent_node] ||= score_node(parent_node)
|
137
|
-
candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node
|
138
|
-
|
139
|
-
content_score = 1
|
140
|
-
|
141
|
-
begin
|
142
|
-
content_score += inner_text.split(',').length
|
143
|
-
content_score += [(inner_text.length / 100).to_i, 3].min
|
144
|
-
rescue => e
|
145
|
-
raise e unless IS_RUBY19
|
146
|
-
inner_text.force_encoding('ASCII-8BIT')
|
147
|
-
content_score += inner_text.split(',').length
|
148
|
-
content_score += [(inner_text.length / 100).to_i, 3].min
|
149
|
-
end
|
150
|
-
|
151
|
-
candidates[parent_node][:content_score] += content_score
|
152
|
-
candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
|
153
|
-
end
|
154
|
-
|
155
|
-
# Scale the final candidates score based on link density. Good content should have a
|
156
|
-
# relatively small link density (5% or less) and be mostly unaffected by this operation.
|
157
|
-
candidates.each do |elem, candidate|
|
158
|
-
candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
|
159
|
-
end
|
160
|
-
|
161
|
-
candidates
|
162
|
-
end
|
163
|
-
|
164
|
-
def class_weight(e)
|
165
|
-
weight = 0
|
166
|
-
if e[:class] && e[:class] != ""
|
167
|
-
if e[:class] =~ REGEXES[:negativeRe]
|
168
|
-
weight -= 25
|
169
|
-
end
|
170
|
-
|
171
|
-
if e[:class] =~ REGEXES[:positiveRe]
|
172
|
-
weight += 25
|
173
|
-
end
|
174
|
-
end
|
175
|
-
|
176
|
-
if e[:id] && e[:id] != ""
|
177
|
-
if e[:id] =~ REGEXES[:negativeRe]
|
178
|
-
weight -= 25
|
179
|
-
end
|
180
|
-
|
181
|
-
if e[:id] =~ REGEXES[:positiveRe]
|
182
|
-
weight += 25
|
183
|
-
end
|
184
|
-
end
|
185
|
-
|
186
|
-
weight
|
187
|
-
end
|
188
|
-
|
189
|
-
def score_node(elem)
|
190
|
-
content_score = class_weight(elem)
|
191
|
-
case elem.name.downcase
|
192
|
-
when "div"
|
193
|
-
content_score += 5
|
194
|
-
when "blockquote"
|
195
|
-
content_score += 3
|
196
|
-
when "form"
|
197
|
-
content_score -= 3
|
198
|
-
when "th"
|
199
|
-
content_score -= 5
|
200
|
-
end
|
201
|
-
{ :content_score => content_score, :elem => elem }
|
202
|
-
end
|
203
|
-
|
204
|
-
def debug(str)
|
205
|
-
puts str if options[:debug]
|
206
|
-
end
|
207
|
-
|
208
|
-
def remove_unlikely_candidates!
|
209
|
-
@html.css("*").each do |elem|
|
210
|
-
str = "#{elem[:class]}#{elem[:id]}"
|
211
|
-
if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
|
212
|
-
debug("Removing unlikely candidate - #{str}")
|
213
|
-
elem.remove
|
214
|
-
end
|
215
|
-
end
|
216
|
-
end
|
217
|
-
|
218
|
-
def transform_misused_divs_into_paragraphs!
|
219
|
-
@html.css("*").each do |elem|
|
220
|
-
if elem.name.downcase == "div"
|
221
|
-
# transform <div>s that do not contain other block elements into <p>s
|
222
|
-
elem_inner_html = IS_RUBY19 ? elem.inner_html.dup.force_encoding('ASCII-8BIT') : elem.inner_html
|
223
|
-
if elem_inner_html !~ REGEXES[:divToPElementsRe]
|
224
|
-
debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
|
225
|
-
elem.name = "p"
|
226
|
-
end
|
227
|
-
else
|
228
|
-
# wrap text nodes in p tags
|
229
|
-
# elem.children.each do |child|
|
230
|
-
# if child.text?
|
231
|
-
## debug("wrapping text node with a p")
|
232
|
-
# child.swap("<p>#{child.text}</p>")
|
233
|
-
# end
|
234
|
-
# end
|
235
|
-
end
|
236
|
-
end
|
237
|
-
end
|
238
|
-
|
239
|
-
def sanitize(node, candidates, options = {})
|
240
|
-
node.css("h1, h2, h3, h4, h5, h6").each do |header|
|
241
|
-
header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
|
242
|
-
end
|
243
|
-
|
244
|
-
node.css("form, object, iframe, embed").each do |elem|
|
245
|
-
elem.remove
|
246
|
-
end
|
247
|
-
|
248
|
-
# Remove empty <p> tags
|
249
|
-
node.css("p").each do |elem|
|
250
|
-
elem.remove if elem.content.strip.empty?
|
251
|
-
end
|
252
|
-
|
253
|
-
# Remove empty <div> tags
|
254
|
-
node.css("div").each do |elem|
|
255
|
-
elem.remove if elem.content.strip.empty?
|
256
|
-
end
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
# Conditionally clean <table>s, <ul>s, and <div>s
|
261
|
-
node.css("table, ul, div").each do |el|
|
262
|
-
weight = class_weight(el)
|
263
|
-
content_score = candidates[el] ? candidates[el][:content_score] : 0
|
264
|
-
name = el.name.downcase
|
265
|
-
|
266
|
-
if weight + content_score < 0
|
267
|
-
el.remove
|
268
|
-
debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
|
269
|
-
elsif (IS_RUBY19 && el.text.force_encoding("ASCII-8BIT").count(",") < 10) || (!IS_RUBY19 && el.text.count(",") < 10)
|
270
|
-
counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
|
271
|
-
counts["li"] -= 100
|
272
|
-
|
273
|
-
content_length = el.text.length
|
274
|
-
link_density = get_link_density(el)
|
275
|
-
to_remove = false
|
276
|
-
reason = ""
|
277
|
-
|
278
|
-
if counts["img"] > counts["p"]
|
279
|
-
reason = "too many images"
|
280
|
-
to_remove = true
|
281
|
-
elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
|
282
|
-
reason = "more <li>s than <p>s"
|
283
|
-
to_remove = true
|
284
|
-
elsif counts["input"] > (counts["p"] / 3).to_i
|
285
|
-
reason = "less than 3x <p>s than <input>s"
|
286
|
-
to_remove = true
|
287
|
-
elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
|
288
|
-
reason = "too short a content length without a single image"
|
289
|
-
to_remove = true
|
290
|
-
elsif weight < 25 && link_density > 0.2
|
291
|
-
reason = "too many links for its weight (#{weight})"
|
292
|
-
to_remove = true
|
293
|
-
elsif weight >= 25 && link_density > 0.5
|
294
|
-
reason = "too many links for its weight (#{weight})"
|
295
|
-
to_remove = true
|
296
|
-
elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
|
297
|
-
reason = "<embed>s with too short a content length, or too many <embed>s"
|
298
|
-
to_remove = true
|
299
|
-
end
|
300
|
-
|
301
|
-
if to_remove
|
302
|
-
debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
|
303
|
-
el.remove
|
304
|
-
end
|
305
|
-
end
|
306
|
-
end
|
307
|
-
|
308
|
-
# We'll sanitize all elements using a whitelist
|
309
|
-
whitelist = @options[:tags] || %w[p]
|
310
|
-
|
311
|
-
# Use a hash for speed (don't want to make a million calls to include?)
|
312
|
-
whitelist = Hash[ whitelist.zip([true] * whitelist.size) ]
|
313
|
-
|
314
|
-
([node] + node.css("*")).each do |el|
|
315
|
-
|
316
|
-
# If element is in whitelist, delete all its attributes
|
317
|
-
if whitelist[el.node_name]
|
318
|
-
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
|
319
|
-
|
320
|
-
# Otherwise, replace the element with its contents
|
321
|
-
else
|
322
|
-
begin
|
323
|
-
el.swap(el.text)
|
324
|
-
rescue => e
|
325
|
-
raise e unless IS_RUBY19
|
326
|
-
el.swap(el.text.force_encoding("ASCII-8BIT"))
|
327
|
-
end
|
328
|
-
end
|
329
|
-
|
330
|
-
end
|
331
|
-
|
332
|
-
# Get rid of duplicate whitespace
|
333
|
-
begin
|
334
|
-
node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
|
335
|
-
rescue => e
|
336
|
-
raise e unless IS_RUBY19
|
337
|
-
node.to_html.force_encoding("ASCII-8BIT").gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
|
338
|
-
end
|
339
|
-
end
|
340
|
-
|
341
|
-
end
|
342
|
-
end
|