pismo 0.7.2 → 0.7.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/LICENSE +1 -1
- data/NOTICE +1 -1
- data/README.markdown +20 -7
- data/Rakefile +0 -23
- data/lib/pismo.rb +6 -3
- data/lib/pismo/document.rb +8 -3
- data/lib/pismo/internal_attributes.rb +38 -6
- data/lib/pismo/reader.rb +10 -394
- data/lib/pismo/reader/base.rb +261 -0
- data/lib/pismo/reader/cluster.rb +171 -0
- data/lib/pismo/reader/tree.rb +154 -0
- data/lib/pismo/version.rb +1 -1
- data/pismo.gemspec +2 -3
- data/test/corpus/metadata_expected.yaml +8 -2
- data/test/corpus/readers/cluster_expected.yaml +45 -0
- data/test/corpus/readers/tree_expected.yaml +55 -0
- data/test/corpus/thegoodbookblog.html +612 -0
- data/test/helper.rb +3 -0
- data/test/test_corpus.rb +16 -3
- metadata +108 -111
- data/test/corpus/metadata_expected.yaml.old +0 -122
@@ -0,0 +1,261 @@
|
|
1
|
+
# encoding: us-ascii
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'sanitize'
|
5
|
+
begin; require 'ap'; rescue LoadError; end
|
6
|
+
|
7
|
+
module Pismo
  module Reader
    # Base class for Pismo's "reader" content extractors. Subclasses (Tree, Cluster)
    # implement #analyze and #content_at; this class owns input sanitization,
    # candidate post-processing, and the public #content / #sentences / #images API.
    class Base
      attr_reader :raw_content, :doc, :content_candidates, :options

      # Elements to keep for /input/ sanitization
      OK_ELEMENTS = %w{a td br th tbody table tr div span img strong em b i body html head title p h1 h2 h3 h4 h5 h6 pre code tt ul li ol blockquote font big small section article abbr audio video cite dd dt figure caption sup form dl dt dd center}

      # Build a tree of attributes that are allowed for each element.. doing it this messy way due to how Sanitize works, alas
      OK_ATTRIBUTES = {}
      OK_CLEAN_ATTRIBUTES = {}
      OK_ELEMENTS.each { |el| OK_ATTRIBUTES[el] = %w{id class href name content type alt title src} }
      OK_ELEMENTS.each { |el| OK_CLEAN_ATTRIBUTES[el] = %w{href title src alt} }

      # Words that we'd like to see in class and ID names for "content"
      GOOD_WORDS = %w{content post blogpost main story body entry text desc asset hentry single entrytext postcontent bodycontent}.uniq

      # Words that indicate crap in general
      BAD_WORDS = %w{reply metadata options commenting comments comment about footer header outer credit sidebar widget subscribe clearfix date social bookmarks links share video watch excerpt related supplement accessibility offscreen meta title signup blq secondary feedback featured clearfix small job jobs listing listings navigation nav byline addcomment postcomment trackback neighbor ads commentform fbfans login similar thumb link blogroll grid twitter wrapper container nav sitesub printfooter editsection visualclear catlinks hidden toc contentsub caption disqus rss shoutbox sponsor blogcomments}.uniq

      # Words that kill a branch dead
      FATAL_WORDS = %w{comments comment bookmarks social links ads related similar footer digg totop metadata sitesub nav sidebar commenting options addcomment leaderboard offscreen job prevlink prevnext navigation reply-link hide hidden sidebox archives vcard}

      # Boilerplate/metadata vocabulary (dates, social chrome, bylines, etc.)
      # used to discount text that is unlikely to be article body copy.
      META_WORDS = %w{january february march april may june july august september october november december jan feb mar apr may jun jul aug sep oct nov dec st th rd nd comments written posted on at published 2000 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 updated last gmt est pst pdt edt cet cdt cst article feature featured filed under comment comments follow twitter facebook email e-mail register story continue continues reading read inside more page next related response responses respond contact street phone tel email e-mail fax info tags tagged tag thanks credit creative commons copy nbsp lt gt this friend printable version subscribe rss mail follow twitter article via leave}.uniq

      # NOTE: duplicate "h6" removed from the original list; the content is otherwise unchanged.
      WONT_CONTAIN_FULL_CONTENT = %w{h1 h2 h3 h4 h5 h6 li ol ul br a img meta cite strong em i b input head small big code title sup sub dd dt}
      COULD_CONTAIN_FULL_CONTENT = %w{body div p table tr td article pre blockquote tbody section}

      ## Output sanitization element sets
      BLOCK_OUTPUT_ELEMENTS = %w{div p h2 h3 h4 h5 h6 li dl pre ul ol blockquote section article audio video cite dd dt figure caption br table tr td thead tbody tfoot}
      INLINE_OUTPUT_ELEMENTS = %w{a img b strong em i br code sup font small big dd dt}
      OUTPUT_ELEMENTS = BLOCK_OUTPUT_ELEMENTS + INLINE_OUTPUT_ELEMENTS
      NON_HEADER_ELEMENTS = %w{p br}

      # Create a document object based on the raw HTML content provided
      #
      # raw_content - HTML string to analyze
      # options     - reader-specific tuning options (passed through to #analyze)
      def initialize(raw_content, options = {})
        @options = options
        @raw_content = Pismo::Document.clean_html(raw_content)
        build_doc
      end

      # Normalize, sanitize, and parse @raw_content into @doc, then delegate to the
      # subclass's #analyze to populate @content_candidates.
      def build_doc
        @content = {}

        if RUBY_VERSION > "1.9"
          # BUGFIX: the original compared an Encoding object to the string "UTF-8",
          # which is always true. Compare against Encoding::UTF_8 instead.
          @raw_content.encode!("UTF-8", :invalid => :replace, :replace => '?') if @raw_content.encoding != Encoding::UTF_8
          @raw_content.encode!("ASCII-8BIT", :invalid => :replace, :replace => '?') if !@raw_content.valid_encoding?
        end

        # Normalize whitespace (as much to make debugging sessions look nice as anything else)
        @raw_content.gsub!(/\s{2,}/, ' ')
        @raw_content.gsub!(/\r/, "\n")
        @raw_content.gsub!(/\n{3,}/, "\n\n")
        @raw_content.gsub!(/(\<br(\s\/)?\>){2,}/, "</p><p>")

        # Remove scripts manually, Sanitize and/or Nokogiri seem to go a bit funny with them
        @raw_content.gsub!(/\<script .*?\<\/script\>/im, '')

        # Get rid of "smart" quotes and other Unicode nonsense; the byte-level
        # replacements below require a binary view of the string on 1.9+.
        @raw_content.force_encoding("ASCII-8BIT") if RUBY_VERSION > "1.9"
        @raw_content.gsub!("\xe2\x80\x89", " ")
        @raw_content.gsub!("\xe2\x80\x99", "'")
        @raw_content.gsub!("\xe2\x80\x98", "'")
        @raw_content.gsub!("\xe2\x80\x9c", '"')
        @raw_content.gsub!("\xe2\x80\x9d", '"')
        @raw_content.gsub!("\xe2\x80\xf6", '.')
        @raw_content.force_encoding("UTF-8") if RUBY_VERSION > "1.9"

        # Sanitize the HTML
        @raw_content = Sanitize.clean(@raw_content,
          :elements => OK_ELEMENTS,
          :attributes => OK_ATTRIBUTES,
          :remove_contents => true,
          :output_encoding => 'utf-8'
        )

        @doc = Nokogiri::HTML(@raw_content, nil, 'utf-8')

        # Do a pre clean up of elements.
        @doc.css("div, span, table, tr, td, pre").each do |el|
          # Any block elements with no child block elements can become paragraphs
          if (BLOCK_OUTPUT_ELEMENTS & el.inner_html.scan(/\<(\w+)/).flatten).empty?
            el.name = "p"
          elsif el.name != "span"
            el.name = "div"
          end

          # Any SPANs that aren't within paragraphs can become paragraphs too
          el.name = "p" if el.name == "span" && !el.path.scan(/[a-z]+/).include?('p')

          # Kill elements whose id/class contains a fatally bad word
          el.remove if (FATAL_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)).size > 0
        end

        analyze
      end

      # Return the content from best match number of index (default 0) and, optionally, clean it to plain-text
      #
      # clean - when true, return plain text; when false, return sanitized HTML
      # index - which ranked content candidate to render (0 = best)
      def content(clean = false, index = 0)
        return @content[[clean, index]] if @content[[clean, index]]
        return '' if !@content_candidates || @content_candidates.empty?

        content_branch = content_at(index)
        orphans_to_remove = []

        # Go through every piece of the content and rip out sections that contain too many tags compared to words
        # This is usually indicative of "widgets" or link bar sections
        content_branch.css('*').each_with_index do |el, i|
          next unless el

          # The page title is handled separately; drop any h1 inside the body.
          if el.name == "h1"
            el.remove
            next
          end

          # A lone h2 is most likely a title duplicate rather than a section header.
          if el.name == "h2" && content_branch.inner_html.scan('<h2').size == 1
            el.remove
          end

          # (Earlier tag/word-ratio and meta-word heuristics were experimental and
          # have been removed; see the project history for the commented drafts.)

          # Drop near-empty elements that aren't images and contain no images.
          if el.text && el.text.strip.length < 3 && !%w{img}.include?(el.name) && el.inner_html !~ /\<img/
            el.remove
            next
          end

          # Paragraphs with no sentence-ending punctuation and no image are probably chrome.
          if el.name == "p" && el.text !~ /(\.|\?|\!|\"|\')(\s|$)/ && el.inner_html !~ /\<img/
            el.remove
            next
          end

          # If the ID or class of the element contains a fatally bad word, get rid of it
          if (BAD_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.scan(/[a-z]+/)).length > 0
            el.remove
            next
          end
        end

        # If a title was found early in the result document but had text before it, remove that text - it's probably crap
        orphans_to_remove.each { |el| el.remove }

        # Clean up the HTML again - Nokogiri outputs it with full doctype and crap
        clean_html = strip(Sanitize.clean(content_branch.to_html, :elements => (clean ? BLOCK_OUTPUT_ELEMENTS : OUTPUT_ELEMENTS), :attributes => (clean ? OK_CLEAN_ATTRIBUTES : OK_ATTRIBUTES)))

        # If the content is desired as "clean" (i.e. plain-text), do some quick fix-ups
        if clean
          # Get rid of line break tags, make list items look nice, remove all other HTML tags, and clean up spaces and newlines
          clean_html.gsub!(/<br.*?>/, "\n")
          clean_html.gsub!(/<li>/, '* ')
          clean_html.gsub!(/<\w+>/, '')
          clean_html.gsub!(/<\/\w+>/, "\n")
          clean_html.gsub!(/\ +/, ' ')
          clean_html.gsub!(/^\s+\n/, "\n")
          clean_html.gsub!(/\n{2,}/, "\n")
          clean_html.strip!
        end

        # If tags butt up against each other across lines, remove the line break(s)
        clean_html.gsub!(/\>\n+\</, '><')

        # Get rid of images whose sources are relative (TODO: Make this optional)
        clean_html.gsub!(/\<img .*?\>/i) do |img_tag|
          img_tag =~ /\Whttp/ ? img_tag : ''
        end

        # Remove empty tags
        clean_html.gsub!(/<(\w+)><\/\1>/, "")

        # Just a messy, hacky way to make output look nicer with subsequent paragraphs..
        clean_html.gsub!(/<\/(div|p|h1|h2|h3|h4|h5|h6)>/, '</\1>' + "\n\n")

        @content[[clean, index]] = clean_html
      end

      # Return up to +qty+ leading sentences extracted from the best content candidate.
      def sentences(qty = 3)
        clean_content = Sanitize.clean(content, :elements => NON_HEADER_ELEMENTS, :attributes => OK_CLEAN_ATTRIBUTES, :remove_contents => %w{h1 h2 h3 h4 h5 h6})

        fodder = ''
        doc = Nokogiri::HTML(clean_content, nil, 'utf-8')

        doc.traverse do |el|
          path_segments = el.path.scan(/[a-z]+/)[2..-1]
          next unless path_segments && path_segments.length > 1
          if el.text? && el.text.strip.length < 3
            el.remove
            next
          end
          if el.text? && NON_HEADER_ELEMENTS.include?(path_segments[-2])
            text = el.text.strip
            text += "." if text !~ /[\.\!\?\"\']$/
            fodder += text + "\n"
          end
        end

        # BUGFIX: dup the cached string before mutating it below, otherwise gsub!
        # would corrupt the memoized @content entry returned by #content.
        fodder = content(true).dup if fodder.to_s.length < 50
        fodder.gsub!(/\b\w\W\s/, '')

        sentences = fodder.scan(/(.+?[\.\?\!])(\s|\Z)/im).map { |s| s.first.strip }

        sentences.compact!
        sentences.map! { |s| s.strip }
        sentences.map! { |s| s.sub(/^[^\"\'a-z0-9\(\[]+/im, '') }
        sentences.map! { |s| s.sub(/[^a-z0-9\'\"\)\]\.\!\:\?]+$/im, '') }
        sentences.map! { |s| s.gsub(/\s+/m, ' ') }
        sentences.first(qty)
      end

      # Return up to +qty+ image URLs found in the extracted content.
      def images(qty = 3)
        doc = Nokogiri::HTML(content, nil, 'utf-8')
        images = []
        doc.css("img").each do |img|
          images << img['src']
          # >= (rather than ==) so non-positive qty values terminate immediately
          break if images.length >= qty
        end
        images
      end

      # Remove leading and trailing spaces on lines throughout a string (a bit like String#strip, but for multi-lines)
      def strip(s)
        s.gsub(/^\s+/, '').gsub(/\s+$/, '')
      end
    end
  end
end
|
@@ -0,0 +1,171 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module Pismo
  module Reader
    # Block-clustering content extractor.
    #
    # Adapted from : http://rubyforge.org/projects/extractcontent/
    #
    # Portions of this code are :
    # Copyright (c) 2007/2008 Nakatani Shuyo / Cybozu Labs Inc. All rights reserved.
    #
    # Permission is hereby granted, free of charge, to any person obtaining
    # a copy of this software and associated documentation files (the
    # "Software"), to deal in the Software without restriction, including
    # without limitation the rights to use, copy, modify, merge, publish,
    # distribute, sublicense, and/or sell copies of the Software, and to
    # permit persons to whom the Software is furnished to do so, subject to
    # the following conditions:
    #
    # The above copyright notice and this permission notice shall be
    # included in all copies or substantial portions of the Software.
    #
    # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
    # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
    # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
    # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
    # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    class Cluster < Base
      # Default option parameters
      DEFAULTS = {
        :threshold => 100,                                      # threshold for score of the text
        :min_length => 80,                                      # minimum length of evaluated blocks
        :decay_factor => 0.73,                                  # decay factor for block score
        :continuous_factor => 1.62,                             # continuous factor for block score ( the larger, the harder to continue )
        :no_body_factor => 0.72,                                # no body factor that reduces block score if waste expressions are present
        :punctuation_weight => 10,                              # score weight for punctuation
        :punctuations => /(\.[^A-Za-z0-9]|,[^0-9]|!|\?)/,       # punctuation characters
        :waste_expressions => /Copyright|All Rights Reserved/i, # characteristic keywords including footer
        :debug => false,                                        # if true, output block information to stdout
      }

      # Analyze the structure of the HTML document and score content blocks for likelihood of containing useful content
      def analyze
        opt = DEFAULTS.clone
        opt.merge!(@options)

        @sections = []
        factor = continuous = 1.0
        body = ''
        score = 0

        # The content is split into blocks of divs
        list = @raw_content.split(/<\/?(?:div)[^>]*>/)
        list.each do |block|
          next unless block
          block.gsub!(/\n/, '')

          # Ignore blocks that have no text
          next if has_only_tags?(block)

          # Each new block iterated over makes it less likely for it to belong
          # to the existing block
          continuous /= opt[:continuous_factor] if body.length > 0

          # Clean up and strip block of html tags for scoring
          clean = clean_block(block)
          next if clean.length < opt[:min_length]

          # Calculate scores for clustering of blocks

          # c represents how probable it is for this block to be a content block
          c = (clean.length + clean.scan(opt[:punctuations]).length * opt[:punctuation_weight]) * factor

          # The further down the document we go (i.e. the more blocks we see),
          # the less likely they are to be valid content blocks
          factor *= opt[:decay_factor]

          # The not body rate represents how likely this is to be a junk block
          not_body_rate = block.scan(opt[:waste_expressions]).length

          # The block score is reduced if there is a not_body_rate
          c *= (opt[:no_body_factor] ** not_body_rate) if not_body_rate > 0

          # c1 represents how probable it is for this block to belong to the
          # existing block or if it is a new one
          c1 = c * continuous

          puts "----- #{c}*#{continuous}=#{c1} #{clean.length} \n\n" if opt[:debug]

          if c1 > opt[:threshold]
            # Treat continuous blocks as cluster
            body += block + "\n"
            score += c1
            continuous = opt[:continuous_factor]
          elsif c > opt[:threshold]
            # Continuous block end
            @sections << { :body => body, :score => score }
            body = block + "\n"
            score = c
            continuous = opt[:continuous_factor]
          else
            # We drop blocks that don't have a high enough c score
          end
        end
        # Add the last block as we've finished iterating
        @sections << { :body => body, :score => score } if body
        # Sort the sections by score
        sorted_sections = @sections.sort_by { |section| section[:score] }
        # Convert to nokogiri representation for compatibility with the content method
        @content_candidates = sorted_sections.reverse.map { |section| Nokogiri::HTML(section[:body], nil, 'utf-8') }
      end

      # Return the parsed candidate document at the given rank (0 = best).
      def content_at(index)
        @content_candidates[index]
      end

      protected

      # Checks if the given block has only tags without text.
      def has_only_tags?(block)
        block.gsub(/<[^>]*>/im, '').strip.length == 0
      end

      # Eliminates link heavy blocks and blocks that are lists of links and
      # then returns block stripped of tags
      def clean_block(block)
        # Return empty block if it is a list of links
        return "" if is_link_list?(block)

        # Return empty block if it is a very link heavy block
        count = 0
        no_links = block.gsub(/<a\s[^>]*>.*?<\/a\s*>/im) { count += 1; '' }.gsub(/<form\s[^>]*>.*?<\/form\s*>/im, '')
        return "" if no_links.length < 20 * count

        strip_tags(no_links)
      end

      # Determines whether a block is link list or not
      def is_link_list?(st)
        if st =~ /<(?:ul|dl|ol)(.+?)<\/(?:ul|dl|ol)>/im
          listpart = $1
          # BUGFIX: the original regexes carried the `n` (ASCII-8BIT) flag, which
          # raises Encoding::CompatibilityError when matched against UTF-8 content
          # containing non-ASCII characters. The patterns are pure ASCII, so the
          # flag is dropped with no change in matching semantics.
          outside = st.gsub(/<(?:ul|dl)(.+?)<\/(?:ul|dl)>/im, '').gsub(/<.+?>/m, '').gsub(/\s+/, ' ')
          list = listpart.split(/<li[^>]*>/)
          list.shift
          rate = evaluate_list(list)
          outside.length <= st.length / (45 / rate)
        end
      end

      # Estimates how much degree of link list
      def evaluate_list(list)
        return 1 if list.length == 0
        hit = 0
        list.each do |line|
          # `n` flag dropped here too; see is_link_list? for rationale.
          hit += 1 if line =~ /<a\s+href=(['"]?)([^"'\s]+)\1/im
        end
        return 9 * (1.0 * hit / list.length) ** 2 + 1
      end

      # Removes all html tags and attributes from html
      def strip_tags(html)
        strip(Sanitize.clean(html, :elements => [], :attributes => []))
      end
    end
  end
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
module Pismo
  module Reader
    # DOM-tree scoring content extractor: walks likely container elements,
    # scores each branch on id/class vocabulary, word counts, and structure,
    # and keeps the top five branches as content candidates.
    class Tree < Base
      # Analyze the structure of the HTML document and score branches for likelihood of containing useful content
      def analyze
        @tree = {}
        # Per-path memo of sub-element stats, shared across branches so that
        # nested containers don't re-scan the same descendants.
        subels = {}

        @doc.css(COULD_CONTAIN_FULL_CONTENT.join(", ")).each do |el|
          # Assume that no content we'll want comes in a total package of fewer than 80 characters!
          next unless el.text.to_s.strip.length >= 80

          path_segments = el.path.scan(/[a-z]+/)[2..-1] || []
          depth = path_segments.length

          # Collect id/class words on the element itself, then walk up the
          # ancestor chain accumulating theirs as well.
          local_ids = (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)
          ids = local_ids

          cp = el.parent
          (depth - 1).times do
            ids += (cp['id'].to_s + ' ' + cp['class'].to_s).downcase.strip.scan(/[a-z]+/)
            cp = cp.parent
          end if depth > 1

          branch = {}
          branch[:ids] = ids
          branch[:local_ids] = local_ids
          branch[:score] = -(BAD_WORDS & ids).size
          branch[:score] += ((GOOD_WORDS & ids).size * 2)
          # Hopelessly bad-word-laden branches are discarded immediately.
          next if branch[:score] < -5

          # Elements that have an ID or class are more likely to be our winners
          branch[:score] += 2 unless local_ids.empty?

          branch[:name] = el.name
          branch[:depth] = depth
          branch[:path] = el.path

          branch[:raw_word_count] = 0
          branch[:word_count] = 0
          branch[:child_count] = 0
          branch[:bad_child_count] = 0
          branch[:score_steps] = []

          el.traverse do |subel|
            path = subel.path
            subels[path] ||= {}
            subels[path][:path_segments] ||= (path.scan(/[a-z]+/)[2..-1] || [])
            subels[path][:is_text] ||= subel.text?

            if subels[path][:is_text]
              subels[path][:text] ||= subel.text.downcase.scan(/[a-z]+/)
              next if subels[path][:text].empty?

              subels[path][:raw_word_count] ||= subels[path][:text].size
              # BUGFIX: the original used %{a h1 ... h6} — a *String* — so
              # include? performed a substring test on the parent's tag name
              # instead of exact membership. %w builds the intended word array
              # (and the duplicated "h6" is dropped).
              subels[path][:word_count] ||= (%w{a h1 h2 h3 h4 h5 h6}.include?(subel.parent.name) ? 0 : subels[path][:text].select { |word| word.length > 3 }.size)
              subels[path][:meta_matches] ||= (subels[path][:text] & META_WORDS).size

              branch[:raw_word_count] += subels[path][:raw_word_count]
              branch[:word_count] += subels[path][:word_count] - subels[path][:meta_matches]
            end

            subels[path][:ids] ||= (subel['id'].to_s + ' ' + subel['class'].to_s).gsub(/[^a-z]/, ' ').downcase.strip.split(/\s+/)
            subels[path][:bad_child_count_inc] = (BAD_WORDS & subels[path][:ids]).size - (GOOD_WORDS & subels[path][:ids]).size
            subels[path][:child_count_inc] = subels[path][:ids].empty? ? 0 : 1

            branch[:bad_child_count] += subels[path][:bad_child_count_inc]
            branch[:child_count] += subels[path][:child_count_inc]
          end

          # Structural bonuses: divs and comma-rich prose look like articles.
          branch[:score] += 2 if branch[:name] == "div"
          branch[:score] += 4 if el.text.scan(/\,\s/).size > 10
          branch[:score_steps] << "lots of commas!" if el.text.scan(/\,\s/).size > 5
          branch[:score] *= 3

          branch[:score] *= 0.7 if el.children && el.children.size < 3
          branch[:score] *= 1.25 if branch[:raw_word_count] > 10
          next if branch[:raw_word_count] < 10
          branch[:score] += [branch[:word_count], 1].max ** 0.5

          word_child_count_ratio = branch[:word_count].to_f / [branch[:child_count], 1].max
          branch[:word_child_count_ratio] = word_child_count_ratio

          # For long branches, reward a high proportion of "real" words and a
          # high words-per-identified-child ratio.
          if branch[:raw_word_count] > 100
            good_word_ratio = branch[:word_count].to_f / branch[:raw_word_count]
            branch[:score] += good_word_ratio * 12

            if word_child_count_ratio > 50
              branch[:score] *= 1.5
            elsif word_child_count_ratio > 30
              branch[:score] *= 1.2
            elsif word_child_count_ratio > 15
              branch[:score] *= 1.1
            elsif word_child_count_ratio < 4
              branch[:score] *= 0.9
            end
          end

          branch[:score_steps] << "s1: #{branch[:score]}"

          # Cumulative penalties for bad-word-bearing descendants.
          bad_child_ratio = branch[:bad_child_count].to_f / [branch[:child_count], 1].max
          branch[:bad_child_ratio] = bad_child_ratio
          branch[:score] += 3 if bad_child_ratio < 0.0
          branch[:score] -= 3 if bad_child_ratio > 0.15
          branch[:score] -= 2 if bad_child_ratio > 0.25
          branch[:score] -= 2 if bad_child_ratio > 0.4
          branch[:score] -= 4 if bad_child_ratio > 0.5
          branch[:score] -= 5 if bad_child_ratio > 0.7
          branch[:score] -= 5 if branch[:bad_child_count] > 20

          branch[:score] += depth
          branch[:score] *= 0.8 if ids.length > 10

          @tree[el.path] = branch
        end

        sorted_tree = @tree.sort_by { |k, v| v[:score] }

        # Sort the branches by their score in reverse order and keep the top 5.
        @content_candidates = sorted_tree.reverse.first([5, sorted_tree.length].min)
      end

      # Look up the candidate's element in @doc by its recorded path.
      # Candidates are [path, branch] pairs, hence `.first`.
      def content_at(index)
        @doc.at(@content_candidates[index].first)
      end
    end
  end
end
|