pismo 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +19 -28
- data/NOTICE +4 -0
- data/README.markdown +37 -40
- data/Rakefile +3 -2
- data/VERSION +1 -1
- data/bin/pismo +15 -7
- data/lib/pismo/document.rb +2 -2
- data/lib/pismo/internal_attributes.rb +23 -16
- data/lib/pismo/reader.rb +390 -0
- data/lib/pismo.rb +3 -2
- data/pismo.gemspec +23 -15
- data/test/corpus/bbcnews2.html +1575 -0
- data/test/corpus/gmane.html +138 -0
- data/test/corpus/metadata_expected.yaml +20 -5
- data/test/corpus/queness.html +919 -0
- data/test/corpus/reader_expected.yaml +45 -0
- data/test/corpus/tweet.html +360 -0
- data/test/corpus/zefrank.html +535 -0
- data/test/test_corpus.rb +9 -1
- metadata +89 -34
- data/lib/pismo/readability.rb +0 -342
- data/test/test_readability.rb +0 -152
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pismo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 7
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 6
|
9
|
+
- 0
|
10
|
+
version: 0.6.0
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Peter Cooper
|
@@ -9,69 +15,107 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date: 2010-06-
|
18
|
+
date: 2010-06-20 00:00:00 +01:00
|
13
19
|
default_executable: pismo
|
14
20
|
dependencies:
|
15
21
|
- !ruby/object:Gem::Dependency
|
16
22
|
name: shoulda
|
17
|
-
|
18
|
-
|
19
|
-
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
20
26
|
requirements:
|
21
27
|
- - ">="
|
22
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
23
32
|
version: "0"
|
24
|
-
|
33
|
+
type: :development
|
34
|
+
version_requirements: *id001
|
25
35
|
- !ruby/object:Gem::Dependency
|
26
|
-
name:
|
27
|
-
|
28
|
-
|
29
|
-
|
36
|
+
name: awesome_print
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
30
40
|
requirements:
|
31
41
|
- - ">="
|
32
42
|
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
44
|
+
segments:
|
45
|
+
- 0
|
33
46
|
version: "0"
|
34
|
-
|
47
|
+
type: :development
|
48
|
+
version_requirements: *id002
|
35
49
|
- !ruby/object:Gem::Dependency
|
36
|
-
name:
|
37
|
-
|
38
|
-
|
39
|
-
|
50
|
+
name: jeweler
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
40
54
|
requirements:
|
41
55
|
- - ">="
|
42
56
|
- !ruby/object:Gem::Version
|
57
|
+
hash: 3
|
58
|
+
segments:
|
59
|
+
- 0
|
43
60
|
version: "0"
|
44
|
-
|
61
|
+
type: :runtime
|
62
|
+
version_requirements: *id003
|
45
63
|
- !ruby/object:Gem::Dependency
|
46
|
-
name:
|
64
|
+
name: nokogiri
|
65
|
+
prerelease: false
|
66
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
hash: 3
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
version: "0"
|
47
75
|
type: :runtime
|
48
|
-
|
49
|
-
|
76
|
+
version_requirements: *id004
|
77
|
+
- !ruby/object:Gem::Dependency
|
78
|
+
name: sanitize
|
79
|
+
prerelease: false
|
80
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
50
82
|
requirements:
|
51
83
|
- - ">="
|
52
84
|
- !ruby/object:Gem::Version
|
85
|
+
hash: 3
|
86
|
+
segments:
|
87
|
+
- 0
|
53
88
|
version: "0"
|
54
|
-
|
89
|
+
type: :runtime
|
90
|
+
version_requirements: *id005
|
55
91
|
- !ruby/object:Gem::Dependency
|
56
92
|
name: fast-stemmer
|
57
|
-
|
58
|
-
|
59
|
-
|
93
|
+
prerelease: false
|
94
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
95
|
+
none: false
|
60
96
|
requirements:
|
61
97
|
- - ">="
|
62
98
|
- !ruby/object:Gem::Version
|
99
|
+
hash: 3
|
100
|
+
segments:
|
101
|
+
- 0
|
63
102
|
version: "0"
|
64
|
-
|
103
|
+
type: :runtime
|
104
|
+
version_requirements: *id006
|
65
105
|
- !ruby/object:Gem::Dependency
|
66
106
|
name: chronic
|
67
|
-
|
68
|
-
|
69
|
-
|
107
|
+
prerelease: false
|
108
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
109
|
+
none: false
|
70
110
|
requirements:
|
71
111
|
- - ">="
|
72
112
|
- !ruby/object:Gem::Version
|
113
|
+
hash: 3
|
114
|
+
segments:
|
115
|
+
- 0
|
73
116
|
version: "0"
|
74
|
-
|
117
|
+
type: :runtime
|
118
|
+
version_requirements: *id007
|
75
119
|
description: Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.
|
76
120
|
email: git@peterc.org
|
77
121
|
executables:
|
@@ -85,6 +129,7 @@ files:
|
|
85
129
|
- .document
|
86
130
|
- .gitignore
|
87
131
|
- LICENSE
|
132
|
+
- NOTICE
|
88
133
|
- README.markdown
|
89
134
|
- Rakefile
|
90
135
|
- VERSION
|
@@ -93,25 +138,30 @@ files:
|
|
93
138
|
- lib/pismo/document.rb
|
94
139
|
- lib/pismo/external_attributes.rb
|
95
140
|
- lib/pismo/internal_attributes.rb
|
96
|
-
- lib/pismo/readability.rb
|
141
|
+
- lib/pismo/reader.rb
|
97
142
|
- lib/pismo/stopwords.txt
|
98
143
|
- pismo.gemspec
|
99
144
|
- test/corpus/bbcnews.html
|
145
|
+
- test/corpus/bbcnews2.html
|
100
146
|
- test/corpus/briancray.html
|
101
147
|
- test/corpus/cant_read.html
|
102
148
|
- test/corpus/factor.html
|
149
|
+
- test/corpus/gmane.html
|
103
150
|
- test/corpus/huffington.html
|
104
151
|
- test/corpus/metadata_expected.yaml
|
105
152
|
- test/corpus/metadata_expected.yaml.old
|
153
|
+
- test/corpus/queness.html
|
154
|
+
- test/corpus/reader_expected.yaml
|
106
155
|
- test/corpus/rubyinside.html
|
107
156
|
- test/corpus/rww.html
|
108
157
|
- test/corpus/spolsky.html
|
109
158
|
- test/corpus/techcrunch.html
|
159
|
+
- test/corpus/tweet.html
|
110
160
|
- test/corpus/youtube.html
|
161
|
+
- test/corpus/zefrank.html
|
111
162
|
- test/helper.rb
|
112
163
|
- test/test_corpus.rb
|
113
164
|
- test/test_pismo_document.rb
|
114
|
-
- test/test_readability.rb
|
115
165
|
has_rdoc: true
|
116
166
|
homepage: http://github.com/peterc/pismo
|
117
167
|
licenses: []
|
@@ -122,21 +172,27 @@ rdoc_options:
|
|
122
172
|
require_paths:
|
123
173
|
- lib
|
124
174
|
required_ruby_version: !ruby/object:Gem::Requirement
|
175
|
+
none: false
|
125
176
|
requirements:
|
126
177
|
- - ">="
|
127
178
|
- !ruby/object:Gem::Version
|
179
|
+
hash: 3
|
180
|
+
segments:
|
181
|
+
- 0
|
128
182
|
version: "0"
|
129
|
-
version:
|
130
183
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
184
|
+
none: false
|
131
185
|
requirements:
|
132
186
|
- - ">="
|
133
187
|
- !ruby/object:Gem::Version
|
188
|
+
hash: 3
|
189
|
+
segments:
|
190
|
+
- 0
|
134
191
|
version: "0"
|
135
|
-
version:
|
136
192
|
requirements: []
|
137
193
|
|
138
194
|
rubyforge_project:
|
139
|
-
rubygems_version: 1.3.
|
195
|
+
rubygems_version: 1.3.7
|
140
196
|
signing_key:
|
141
197
|
specification_version: 3
|
142
198
|
summary: Extracts or retrieves content-related metadata from HTML pages
|
@@ -144,4 +200,3 @@ test_files:
|
|
144
200
|
- test/helper.rb
|
145
201
|
- test/test_corpus.rb
|
146
202
|
- test/test_pismo_document.rb
|
147
|
-
- test/test_readability.rb
|
data/lib/pismo/readability.rb
DELETED
@@ -1,342 +0,0 @@
|
|
1
|
-
# This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
|
2
|
-
#
|
3
|
-
# This is a Ruby port of arc90's readability project
|
4
|
-
# http://lab.arc90.com/experiments/readability/
|
5
|
-
# Given a html document, it pulls out the main body text and cleans it up.
|
6
|
-
# Ruby port by starrhorne and iterationlabs
|
7
|
-
#
|
8
|
-
# Original JavaScript version:
|
9
|
-
# http://lab.arc90.com/experiments/readability/js/readability.js
|
10
|
-
# * Copyright (c) 2009 Arc90 Inc
|
11
|
-
# * Readability is licensed under the Apache License, Version 2.0.
|
12
|
-
#
|
13
|
-
# Minor edits and tweaks by Peter Cooper
|
14
|
-
|
15
|
-
require 'nokogiri'
|
16
|
-
|
17
|
-
IS_RUBY19 = "a".respond_to?(:encoding)
|
18
|
-
|
19
|
-
module Readability
|
20
|
-
class Document
|
21
|
-
TEXT_LENGTH_THRESHOLD = 25
|
22
|
-
RETRY_LENGTH = 250
|
23
|
-
|
24
|
-
attr_accessor :options, :html
|
25
|
-
|
26
|
-
def initialize(input, options = {})
|
27
|
-
@input = input
|
28
|
-
@options = options
|
29
|
-
make_html
|
30
|
-
end
|
31
|
-
|
32
|
-
def make_html
|
33
|
-
@html = Nokogiri::HTML(@input) #, nil, 'UTF-8')
|
34
|
-
end
|
35
|
-
|
36
|
-
REGEXES = {
|
37
|
-
:unlikelyCandidatesRe => /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
|
38
|
-
:okMaybeItsACandidateRe => /and|article|body|column|main/i,
|
39
|
-
:positiveRe => /article|body|content|entry|hentry|page|pagination|post|story|text/i,
|
40
|
-
:negativeRe => /combx|comment|contact|foot|box_wrap|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags/i,
|
41
|
-
:divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
42
|
-
:replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
|
43
|
-
:replaceFontsRe => /<(\/?)font[^>]*>/i,
|
44
|
-
:trimRe => /^\s+|\s+$/,
|
45
|
-
:normalizeRe => /\s{2,}/,
|
46
|
-
:killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
|
47
|
-
:videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
|
48
|
-
}
|
49
|
-
|
50
|
-
def content(remove_unlikely_candidates = true)
|
51
|
-
@html.css("script, style").each { |i| i.remove }
|
52
|
-
|
53
|
-
remove_unlikely_candidates! if remove_unlikely_candidates
|
54
|
-
transform_misused_divs_into_paragraphs!
|
55
|
-
candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
|
56
|
-
best_candidate = select_best_candidate(candidates)
|
57
|
-
article = get_article(candidates, best_candidate)
|
58
|
-
cleaned_article = sanitize(article, candidates, options)
|
59
|
-
cleaned_article.gsub!(/^\s+\n/, "\n")
|
60
|
-
cleaned_article.gsub!(/[\ \t]+/, ' ')
|
61
|
-
cleaned_article.gsub!(/^\s+/, '')
|
62
|
-
cleaned_article.gsub!(/\<\!\-\-.*?\-\-\>/m, '')
|
63
|
-
if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
|
64
|
-
make_html
|
65
|
-
content(false)
|
66
|
-
else
|
67
|
-
cleaned_article
|
68
|
-
end
|
69
|
-
end
|
70
|
-
|
71
|
-
def get_article(candidates, best_candidate)
|
72
|
-
# Now that we have the top candidate, look through its siblings for content that might also be related.
|
73
|
-
# Things like preambles, content split by ads that we removed, etc.
|
74
|
-
|
75
|
-
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
|
76
|
-
output = Nokogiri::XML::Node.new('div', @html)
|
77
|
-
|
78
|
-
return output unless best_candidate[:elem]
|
79
|
-
|
80
|
-
best_candidate[:elem].parent.children.each do |sibling|
|
81
|
-
append = false
|
82
|
-
append = true if sibling == best_candidate[:elem]
|
83
|
-
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
|
84
|
-
|
85
|
-
if sibling.name.downcase == "p"
|
86
|
-
link_density = get_link_density(sibling)
|
87
|
-
node_content = sibling.text
|
88
|
-
node_length = node_content.length
|
89
|
-
|
90
|
-
if node_length > 80 && link_density < 0.25
|
91
|
-
append = true
|
92
|
-
elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
|
93
|
-
append = true
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
if append
|
98
|
-
sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
|
99
|
-
output << sibling
|
100
|
-
end
|
101
|
-
end
|
102
|
-
|
103
|
-
output
|
104
|
-
end
|
105
|
-
|
106
|
-
def select_best_candidate(candidates)
|
107
|
-
sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
|
108
|
-
|
109
|
-
debug("Top 5 canidates:")
|
110
|
-
sorted_candidates[0...5].each do |candidate|
|
111
|
-
debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
|
112
|
-
end
|
113
|
-
|
114
|
-
best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
|
115
|
-
#debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")
|
116
|
-
|
117
|
-
best_candidate
|
118
|
-
end
|
119
|
-
|
120
|
-
def get_link_density(elem)
|
121
|
-
link_length = elem.css("a").map {|i| i.text}.join("").length
|
122
|
-
text_length = elem.text.length
|
123
|
-
link_length / text_length.to_f
|
124
|
-
end
|
125
|
-
|
126
|
-
def score_paragraphs(min_text_length)
|
127
|
-
candidates = {}
|
128
|
-
@html.css("p,td").each do |elem|
|
129
|
-
parent_node = elem.parent
|
130
|
-
grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
|
131
|
-
inner_text = elem.text
|
132
|
-
|
133
|
-
# If this paragraph is less than 25 characters, don't even count it.
|
134
|
-
next if inner_text.length < min_text_length
|
135
|
-
|
136
|
-
candidates[parent_node] ||= score_node(parent_node)
|
137
|
-
candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node
|
138
|
-
|
139
|
-
content_score = 1
|
140
|
-
|
141
|
-
begin
|
142
|
-
content_score += inner_text.split(',').length
|
143
|
-
content_score += [(inner_text.length / 100).to_i, 3].min
|
144
|
-
rescue => e
|
145
|
-
raise e unless IS_RUBY19
|
146
|
-
inner_text.force_encoding('ASCII-8BIT')
|
147
|
-
content_score += inner_text.split(',').length
|
148
|
-
content_score += [(inner_text.length / 100).to_i, 3].min
|
149
|
-
end
|
150
|
-
|
151
|
-
candidates[parent_node][:content_score] += content_score
|
152
|
-
candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
|
153
|
-
end
|
154
|
-
|
155
|
-
# Scale the final candidates score based on link density. Good content should have a
|
156
|
-
# relatively small link density (5% or less) and be mostly unaffected by this operation.
|
157
|
-
candidates.each do |elem, candidate|
|
158
|
-
candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
|
159
|
-
end
|
160
|
-
|
161
|
-
candidates
|
162
|
-
end
|
163
|
-
|
164
|
-
def class_weight(e)
|
165
|
-
weight = 0
|
166
|
-
if e[:class] && e[:class] != ""
|
167
|
-
if e[:class] =~ REGEXES[:negativeRe]
|
168
|
-
weight -= 25
|
169
|
-
end
|
170
|
-
|
171
|
-
if e[:class] =~ REGEXES[:positiveRe]
|
172
|
-
weight += 25
|
173
|
-
end
|
174
|
-
end
|
175
|
-
|
176
|
-
if e[:id] && e[:id] != ""
|
177
|
-
if e[:id] =~ REGEXES[:negativeRe]
|
178
|
-
weight -= 25
|
179
|
-
end
|
180
|
-
|
181
|
-
if e[:id] =~ REGEXES[:positiveRe]
|
182
|
-
weight += 25
|
183
|
-
end
|
184
|
-
end
|
185
|
-
|
186
|
-
weight
|
187
|
-
end
|
188
|
-
|
189
|
-
def score_node(elem)
|
190
|
-
content_score = class_weight(elem)
|
191
|
-
case elem.name.downcase
|
192
|
-
when "div"
|
193
|
-
content_score += 5
|
194
|
-
when "blockquote"
|
195
|
-
content_score += 3
|
196
|
-
when "form"
|
197
|
-
content_score -= 3
|
198
|
-
when "th"
|
199
|
-
content_score -= 5
|
200
|
-
end
|
201
|
-
{ :content_score => content_score, :elem => elem }
|
202
|
-
end
|
203
|
-
|
204
|
-
def debug(str)
|
205
|
-
puts str if options[:debug]
|
206
|
-
end
|
207
|
-
|
208
|
-
def remove_unlikely_candidates!
|
209
|
-
@html.css("*").each do |elem|
|
210
|
-
str = "#{elem[:class]}#{elem[:id]}"
|
211
|
-
if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
|
212
|
-
debug("Removing unlikely candidate - #{str}")
|
213
|
-
elem.remove
|
214
|
-
end
|
215
|
-
end
|
216
|
-
end
|
217
|
-
|
218
|
-
def transform_misused_divs_into_paragraphs!
|
219
|
-
@html.css("*").each do |elem|
|
220
|
-
if elem.name.downcase == "div"
|
221
|
-
# transform <div>s that do not contain other block elements into <p>s
|
222
|
-
elem_inner_html = IS_RUBY19 ? elem.inner_html.dup.force_encoding('ASCII-8BIT') : elem.inner_html
|
223
|
-
if elem_inner_html !~ REGEXES[:divToPElementsRe]
|
224
|
-
debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
|
225
|
-
elem.name = "p"
|
226
|
-
end
|
227
|
-
else
|
228
|
-
# wrap text nodes in p tags
|
229
|
-
# elem.children.each do |child|
|
230
|
-
# if child.text?
|
231
|
-
## debug("wrapping text node with a p")
|
232
|
-
# child.swap("<p>#{child.text}</p>")
|
233
|
-
# end
|
234
|
-
# end
|
235
|
-
end
|
236
|
-
end
|
237
|
-
end
|
238
|
-
|
239
|
-
def sanitize(node, candidates, options = {})
|
240
|
-
node.css("h1, h2, h3, h4, h5, h6").each do |header|
|
241
|
-
header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
|
242
|
-
end
|
243
|
-
|
244
|
-
node.css("form, object, iframe, embed").each do |elem|
|
245
|
-
elem.remove
|
246
|
-
end
|
247
|
-
|
248
|
-
# Remove empty <p> tags
|
249
|
-
node.css("p").each do |elem|
|
250
|
-
elem.remove if elem.content.strip.empty?
|
251
|
-
end
|
252
|
-
|
253
|
-
# Remove empty <div> tags
|
254
|
-
node.css("div").each do |elem|
|
255
|
-
elem.remove if elem.content.strip.empty?
|
256
|
-
end
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
# Conditionally clean <table>s, <ul>s, and <div>s
|
261
|
-
node.css("table, ul, div").each do |el|
|
262
|
-
weight = class_weight(el)
|
263
|
-
content_score = candidates[el] ? candidates[el][:content_score] : 0
|
264
|
-
name = el.name.downcase
|
265
|
-
|
266
|
-
if weight + content_score < 0
|
267
|
-
el.remove
|
268
|
-
debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
|
269
|
-
elsif (IS_RUBY19 && el.text.force_encoding("ASCII-8BIT").count(",") < 10) || (!IS_RUBY19 && el.text.count(",") < 10)
|
270
|
-
counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
|
271
|
-
counts["li"] -= 100
|
272
|
-
|
273
|
-
content_length = el.text.length
|
274
|
-
link_density = get_link_density(el)
|
275
|
-
to_remove = false
|
276
|
-
reason = ""
|
277
|
-
|
278
|
-
if counts["img"] > counts["p"]
|
279
|
-
reason = "too many images"
|
280
|
-
to_remove = true
|
281
|
-
elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
|
282
|
-
reason = "more <li>s than <p>s"
|
283
|
-
to_remove = true
|
284
|
-
elsif counts["input"] > (counts["p"] / 3).to_i
|
285
|
-
reason = "less than 3x <p>s than <input>s"
|
286
|
-
to_remove = true
|
287
|
-
elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
|
288
|
-
reason = "too short a content length without a single image"
|
289
|
-
to_remove = true
|
290
|
-
elsif weight < 25 && link_density > 0.2
|
291
|
-
reason = "too many links for its weight (#{weight})"
|
292
|
-
to_remove = true
|
293
|
-
elsif weight >= 25 && link_density > 0.5
|
294
|
-
reason = "too many links for its weight (#{weight})"
|
295
|
-
to_remove = true
|
296
|
-
elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
|
297
|
-
reason = "<embed>s with too short a content length, or too many <embed>s"
|
298
|
-
to_remove = true
|
299
|
-
end
|
300
|
-
|
301
|
-
if to_remove
|
302
|
-
debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
|
303
|
-
el.remove
|
304
|
-
end
|
305
|
-
end
|
306
|
-
end
|
307
|
-
|
308
|
-
# We'll sanitize all elements using a whitelist
|
309
|
-
whitelist = @options[:tags] || %w[p]
|
310
|
-
|
311
|
-
# Use a hash for speed (don't want to make a million calls to include?)
|
312
|
-
whitelist = Hash[ whitelist.zip([true] * whitelist.size) ]
|
313
|
-
|
314
|
-
([node] + node.css("*")).each do |el|
|
315
|
-
|
316
|
-
# If element is in whitelist, delete all its attributes
|
317
|
-
if whitelist[el.node_name]
|
318
|
-
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
|
319
|
-
|
320
|
-
# Otherwise, replace the element with its contents
|
321
|
-
else
|
322
|
-
begin
|
323
|
-
el.swap(el.text)
|
324
|
-
rescue => e
|
325
|
-
raise e unless IS_RUBY19
|
326
|
-
el.swap(el.text.force_encoding("ASCII-8BIT"))
|
327
|
-
end
|
328
|
-
end
|
329
|
-
|
330
|
-
end
|
331
|
-
|
332
|
-
# Get rid of duplicate whitespace
|
333
|
-
begin
|
334
|
-
node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
|
335
|
-
rescue => e
|
336
|
-
raise e unless IS_RUBY19
|
337
|
-
node.to_html.force_encoding("ASCII-8BIT").gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
|
338
|
-
end
|
339
|
-
end
|
340
|
-
|
341
|
-
end
|
342
|
-
end
|