pismo 0.7.2 → 0.7.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,261 @@
1
+ # encoding: us-ascii
2
+
3
+ require 'nokogiri'
4
+ require 'sanitize'
5
+ begin; require 'ap'; rescue LoadError; end
6
+
7
+ module Pismo
8
+ module Reader
9
+ class Base
10
+ attr_reader :raw_content, :doc, :content_candidates, :options
11
+
12
+ # Elements to keep for /input/ sanitization
13
+ OK_ELEMENTS = %w{a td br th tbody table tr div span img strong em b i body html head title p h1 h2 h3 h4 h5 h6 pre code tt ul li ol blockquote font big small section article abbr audio video cite dd dt figure caption sup form dl dt dd center}
14
+
15
+ # Build a tree of attributes that are allowed for each element.. doing it this messy way due to how Sanitize works, alas
16
+ OK_ATTRIBUTES = {}
17
+ OK_CLEAN_ATTRIBUTES = {}
18
+ OK_ELEMENTS.each { |el| OK_ATTRIBUTES[el] = %w{id class href name content type alt title src} }
19
+ OK_ELEMENTS.each { |el| OK_CLEAN_ATTRIBUTES[el] = %w{href title src alt} }
20
+
21
+
22
+ # Words that we'd like to see in class and ID names for "content"
23
+ GOOD_WORDS = %w{content post blogpost main story body entry text desc asset hentry single entrytext postcontent bodycontent}.uniq
24
+
25
+ # Words that indicate crap in general
26
+ BAD_WORDS = %w{reply metadata options commenting comments comment about footer header outer credit sidebar widget subscribe clearfix date social bookmarks links share video watch excerpt related supplement accessibility offscreen meta title signup blq secondary feedback featured clearfix small job jobs listing listings navigation nav byline addcomment postcomment trackback neighbor ads commentform fbfans login similar thumb link blogroll grid twitter wrapper container nav sitesub printfooter editsection visualclear catlinks hidden toc contentsub caption disqus rss shoutbox sponsor blogcomments}.uniq
27
+
28
+ # Words that kill a branch dead
29
+ FATAL_WORDS = %w{comments comment bookmarks social links ads related similar footer digg totop metadata sitesub nav sidebar commenting options addcomment leaderboard offscreen job prevlink prevnext navigation reply-link hide hidden sidebox archives vcard}
30
+
31
+ META_WORDS = %w{january february march april may june july august september october november december jan feb mar apr may jun jul aug sep oct nov dec st th rd nd comments written posted on at published 2000 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 updated last gmt est pst pdt edt cet cdt cst article feature featured filed under comment comments follow twitter facebook email e-mail register story continue continues reading read inside more page next related response responses respond contact street phone tel email e-mail fax info tags tagged tag thanks credit creative commons copy nbsp lt gt this friend printable version subscribe rss mail follow twitter article via leave}.uniq
32
+
33
+ WONT_CONTAIN_FULL_CONTENT = %w{h1 h2 h3 h4 h5 h6 h6 li ol ul br a img meta cite strong em i b input head small big code title sup sub dd dt}
34
+ COULD_CONTAIN_FULL_CONTENT = %w{body div p table tr td article pre blockquote tbody section}
35
+
36
+ ## Output sanitization element sets
37
+ BLOCK_OUTPUT_ELEMENTS = %w{div p h2 h3 h4 h5 h6 li dl pre ul ol blockquote section article audio video cite dd dt figure caption br table tr td thead tbody tfoot}
38
+ INLINE_OUTPUT_ELEMENTS = %w{a img b strong em i br code sup font small big dd dt}
39
+ OUTPUT_ELEMENTS = BLOCK_OUTPUT_ELEMENTS + INLINE_OUTPUT_ELEMENTS
40
+ NON_HEADER_ELEMENTS = %w{p br}
41
+
42
# Build a reader around +raw_content+: the markup is cleaned up-front via
# Pismo::Document.clean_html and immediately parsed into a document tree.
def initialize(raw_content, options = {})
  @raw_content = Pismo::Document.clean_html(raw_content)
  @options = options
  build_doc
end
48
+
49
# Normalize, sanitize, and parse @raw_content into a Nokogiri document
# (@doc), then hand off to the subclass's #analyze to pick content
# candidates. Also resets the per-document content cache (@content).
def build_doc
  @content = {}

  if RUBY_VERSION > "1.9"
    # Transcode to UTF-8 only when we aren't there already.
    # was `@raw_content.encoding != "UTF-8"` — that compares an Encoding
    # object to a String, which is always true, forcing a pointless
    # re-encode on every document
    @raw_content.encode!("UTF-8", :invalid => :replace, :replace => '?') if @raw_content.encoding != Encoding::UTF_8
    # If the bytes still don't form a valid string, drop to binary with
    # invalid sequences replaced
    @raw_content.encode!("ASCII-8BIT", :invalid => :replace, :replace => '?') if !@raw_content.valid_encoding?
  end

  # Normalize whitespace (as much to make debugging sessions look nice as anything else)
  @raw_content.gsub!(/\s{2,}/, ' ')
  @raw_content.gsub!(/\r/, "\n")
  @raw_content.gsub!(/\n{3,}/, "\n\n")
  @raw_content.gsub!(/(\<br(\s\/)?\>){2,}/, "</p><p>")

  # Remove scripts manually, Sanitize and/or Nokogiri seem to go a bit funny with them
  @raw_content.gsub!(/\<script .*?\<\/script\>/im, '')

  # Get rid of "smart" quotes and other Unicode nonsense. The replacements
  # are done byte-wise, hence the temporary switch to a binary encoding.
  @raw_content.force_encoding("ASCII-8BIT") if RUBY_VERSION > "1.9"
  @raw_content.gsub!("\xe2\x80\x89", " ")   # thin space
  @raw_content.gsub!("\xe2\x80\x99", "'")   # right single quote
  @raw_content.gsub!("\xe2\x80\x98", "'")   # left single quote
  @raw_content.gsub!("\xe2\x80\x9c", '"')   # left double quote
  @raw_content.gsub!("\xe2\x80\x9d", '"')   # right double quote
  # was "\xe2\x80\xf6" — not a valid UTF-8 sequence, so it could never
  # match anything; the horizontal ellipsis (U+2026) is \xe2\x80\xa6
  @raw_content.gsub!("\xe2\x80\xa6", '.')
  @raw_content.force_encoding("UTF-8") if RUBY_VERSION > "1.9"

  # Sanitize the HTML down to the whitelisted input elements/attributes
  @raw_content = Sanitize.clean(@raw_content,
    :elements => OK_ELEMENTS,
    :attributes => OK_ATTRIBUTES,
    :remove_contents => true,
    :output_encoding => 'utf-8'
  )

  @doc = Nokogiri::HTML(@raw_content, nil, 'utf-8')

  # Do a pre clean up of elements.
  @doc.css("div, span, table, tr, td, pre").each do |el|
    # Any block elements with no child block elements can become paragraphs
    if (BLOCK_OUTPUT_ELEMENTS & el.inner_html.scan(/\<(\w+)/).flatten).empty?
      el.name = "p"
    elsif el.name != "span"
      el.name = "div"
    end

    # Any SPANs that aren't within paragraphs can become paragraphs too
    el.name = "p" if el.name == "span" && !el.path.scan(/[a-z]+/).include?('p')

    # Branches whose id/class contains a fatal word are killed outright
    el.remove if (FATAL_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)).size > 0
  end

  analyze
end
104
+
105
# Return the content for the candidate ranked at +index+ (0 = best match).
# Pass clean = true for a plain-text-ish rendering. Results are memoized
# in @content keyed by the [clean, index] pair.
def content(clean = false, index = 0)
  key = [clean, index]
  return @content[key] if @content[key]
  return '' if !@content_candidates || @content_candidates.empty?

  branch = content_at(index)
  orphans = []

  # Walk every node of the candidate and rip out pieces that look like
  # navigation, widgets, or repeated page furniture rather than prose.
  branch.css('*').each do |node|
    next unless node

    # A stray H1 inside the content is almost always the page title again
    if node.name == "h1"
      node.remove
      next
    end

    # A lone H2 tends to be a page-level header rather than content
    node.remove if node.name == "h2" && branch.inner_html.scan('<h2').size == 1

    # Nearly-empty elements that carry no image add nothing
    if node.text && node.text.strip.length < 3 && !%w{img}.include?(node.name) && node.inner_html !~ /\<img/
      node.remove
      next
    end

    # Paragraphs without sentence-ending punctuation (and no image) are dropped
    if node.name == "p" && node.text !~ /(\.|\?|\!|\"|\')(\s|$)/ && node.inner_html !~ /\<img/
      node.remove
      next
    end

    # If the ID or class of the element contains a bad word, get rid of it
    if (BAD_WORDS & (node['id'].to_s + ' ' + node['class'].to_s).downcase.scan(/[a-z]+/)).length > 0
      node.remove
      next
    end
  end

  # Reserved for "text before an early title" cleanup
  orphans.each { |node| node.remove }

  # Clean up the HTML again - Nokogiri outputs it with full doctype and crap
  html = strip(Sanitize.clean(branch.to_html,
    :elements => (clean ? BLOCK_OUTPUT_ELEMENTS : OUTPUT_ELEMENTS),
    :attributes => (clean ? OK_CLEAN_ATTRIBUTES : OK_ATTRIBUTES)))

  # For "clean" output: drop line-break tags, bullet the list items, strip
  # the remaining tags, and tidy up spaces and newlines
  if clean
    html.gsub!(/<br.*?>/, "\n")
    html.gsub!(/<li>/, '* ')
    html.gsub!(/<\w+>/, '')
    html.gsub!(/<\/\w+>/, "\n")
    html.gsub!(/\ +/, ' ')
    html.gsub!(/^\s+\n/, "\n")
    html.gsub!(/\n{2,}/, "\n")
    html.strip!
  end

  # If tags butt up against each other across lines, remove the line break(s)
  html.gsub!(/\>\n+\</, '><')

  # Get rid of images whose sources are relative (TODO: Make this optional)
  html.gsub!(/\<img .*?\>/i) { |img_tag| img_tag =~ /\Whttp/ ? img_tag : '' }

  # Remove empty tags
  html.gsub!(/<(\w+)><\/\1>/, "")

  # Messy, hacky way to make output look nicer with subsequent paragraphs
  html.gsub!(/<\/(div|p|h1|h2|h3|h4|h5|h6)>/, '</\1>' + "\n\n")

  @content[key] = html
end
210
+
211
# Pull out up to +qty+ representative sentences from the document's content.
def sentences(qty = 3)
  # Reduce the content down to non-header, paragraph-level text only
  clean_content = Sanitize.clean(content, :elements => NON_HEADER_ELEMENTS, :attributes => OK_CLEAN_ATTRIBUTES, :remove_contents => %w{h1 h2 h3 h4 h5 h6})

  fodder = ''
  doc = Nokogiri::HTML(clean_content, nil, 'utf-8')

  doc.traverse do |el|
    path_segments = el.path.scan(/[a-z]+/)[2..-1]
    next unless path_segments && path_segments.length > 1

    # Drop trivially short text fragments
    if el.text? && el.text.strip.length < 3
      el.remove
      next
    end

    # Collect text sitting directly inside paragraph-level elements,
    # ensuring each chunk ends with sentence punctuation
    if el.text? && NON_HEADER_ELEMENTS.include?(path_segments[-2])
      text = el.text.strip
      text += "." if text !~ /[\.\!\?\"\']$/
      fodder += text + "\n"
    end
  end

  # Fall back to the full plain-text content if we found too little.
  # was `fodder = content(true)` — the gsub! below would then mutate the
  # string memoized inside @content, corrupting the cache; dup first
  fodder = content(true).dup if fodder.to_s.length < 50
  fodder.gsub!(/\b\w\W\s/, '')

  # Split into sentences on terminating punctuation
  sentences = fodder.scan(/(.+?[\.\?\!])(\s|\Z)/im).map { |s| s.first.strip }

  sentences.compact!
  sentences.map! { |s| s.strip }
  sentences.map! { |s| s.sub(/^[^\"\'a-z0-9\(\[]+/im, '') }          # trim leading junk
  sentences.map! { |s| s.sub(/[^a-z0-9\'\"\)\]\.\!\:\?]+$/im, '') }  # trim trailing junk
  sentences.map! { |s| s.gsub(/\s+/m, ' ') }
  sentences.first(qty)
end
244
+
245
# Collect up to +qty+ image URLs (src attributes) from the content,
# in document order.
def images(qty = 3)
  parsed = Nokogiri::HTML(content, nil, 'utf-8')
  sources = []
  parsed.css("img").each do |img|
    sources << img['src']
    break if sources.length == qty
  end
  sources
end
254
+
255
# Remove leading and trailing whitespace from every line of a string —
# like String#strip, but applied per line of a multi-line string.
def strip(s)
  without_leading = s.gsub(/^\s+/, '')
  without_leading.gsub(/\s+$/, '')
end
259
+ end
260
+ end
261
+ end
@@ -0,0 +1,171 @@
1
+ # encoding: utf-8
2
+
3
+ module Pismo
4
+ module Reader
5
+ class Cluster < Base
6
+
7
+ # Adapted from : http://rubyforge.org/projects/extractcontent/
8
+ #
9
+ # Portions of this code are :
10
+ # Copyright (c) 2007/2008 Nakatani Shuyo / Cybozu Labs Inc. All rights reserved.
11
+ #
12
+ # Permission is hereby granted, free of charge, to any person obtaining
13
+ # a copy of this software and associated documentation files (the
14
+ # "Software"), to deal in the Software without restriction, including
15
+ # without limitation the rights to use, copy, modify, merge, publish,
16
+ # distribute, sublicense, and/or sell copies of the Software, and to
17
+ # permit persons to whom the Software is furnished to do so, subject to
18
+ # the following conditions:
19
+ #
20
+ # The above copyright notice and this permission notice shall be
21
+ # included in all copies or substantial portions of the Software.
22
+ #
23
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
27
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
28
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
29
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30
+
31
+ # Default option parameters
32
+ DEFAULTS = {
33
+ :threshold => 100, # threshold for score of the text
34
+ :min_length => 80, # minimum length of evaluated blocks
35
+ :decay_factor => 0.73, # decay factor for block score
36
+ :continuous_factor => 1.62, # continuous factor for block score ( the larger, the harder to continue )
37
+ :no_body_factor => 0.72, # no body factor that reduces block score if waste expressions are present
38
+ :punctuation_weight => 10, # score weight for punctuation
39
+ :punctuations => /(\.[^A-Za-z0-9]|,[^0-9]|!|\?)/, # punctuation characters
40
+ :waste_expressions => /Copyright|All Rights Reserved/i, # characteristic keywords including footer
41
+ :debug => false, # if true, output block information to stdout
42
+ }
43
+
44
# Analyze the structure of the HTML document and score content blocks for
# likelihood of containing useful content. Splits the raw content on div
# boundaries, scores each block, clusters adjacent high-scoring blocks,
# and fills @content_candidates (best first) with parsed documents.
def analyze
  opt = DEFAULTS.clone
  opt.merge!(@options)

  @sections = []
  factor = continuous = 1.0
  body = ''
  score = 0

  # The content is split into blocks of divs
  list = @raw_content.split(/<\/?(?:div)[^>]*>/)
  list.each do |block|
    next unless block
    block.gsub!(/\n/, '')

    # Ignore blocks that have no text
    next if has_only_tags?(block)

    # Each new block iterated over makes it less likely for it to belong
    # to the existing block
    continuous /= opt[:continuous_factor] if body.length > 0

    # Clean up and strip block of html tags for scoring
    clean = clean_block(block)
    next if clean.length < opt[:min_length]

    # c represents how probable it is for this block to be a content block
    c = (clean.length + clean.scan(opt[:punctuations]).length * opt[:punctuation_weight]) * factor

    # The further down the document we go (i.e. the more blocks we see),
    # the less likely they are to be valid content blocks
    factor *= opt[:decay_factor]

    # The not body rate represents how likely this is to be a junk block
    not_body_rate = block.scan(opt[:waste_expressions]).length

    # The block score is reduced if there is a not_body_rate
    c *= (opt[:no_body_factor] ** not_body_rate) if not_body_rate > 0

    # c1 represents how probable it is for this block to belong to the
    # existing cluster (versus starting a new one)
    c1 = c * continuous

    puts "----- #{c}*#{continuous}=#{c1} #{clean.length} \n\n" if opt[:debug]

    if c1 > opt[:threshold]
      # Treat continuous blocks as one cluster
      body += block + "\n"
      score += c1
      continuous = opt[:continuous_factor]
    elsif c > opt[:threshold]
      # The previous cluster has ended; record it and start a new one
      @sections << { :body => body, :score => score }
      body = block + "\n"
      score = c
      continuous = opt[:continuous_factor]
    end
    # Blocks that never clear the threshold are simply dropped
  end

  # Add the last cluster as we've finished iterating.
  # was `if body` — a String is always truthy, so an empty section was
  # appended even when nothing scored; guard on emptiness instead
  @sections << { :body => body, :score => score } unless body.empty?

  # Sort the sections by score
  sorted_sections = @sections.sort_by { |section| section[:score] }
  # Convert to nokogiri representation for compatibility with the content method
  @content_candidates = sorted_sections.reverse.map { |section| Nokogiri::HTML(section[:body], nil, 'utf-8') }
end
116
+
117
# Return the pre-parsed candidate document ranked at +index+
# (nil when the index is out of range).
def content_at(index)
  candidates = @content_candidates
  candidates.fetch(index, nil)
end
120
+
121
+ protected
122
+
123
# Checks if the given block consists solely of tags, with no text content.
def has_only_tags?(block)
  stripped = block.gsub(/<[^>]*>/im, '')
  stripped.strip.empty?
end
127
+
128
# Eliminates link-heavy blocks and blocks that are lists of links, then
# returns the block stripped of its tags (empty string when rejected).
def clean_block(block)
  # Reject outright if the block is a list of links
  return "" if is_link_list?(block)

  # Reject very link-heavy blocks: strip anchors (counting them) and
  # forms, then compare the leftover length against the anchor count
  anchor_count = 0
  remainder = block.gsub(/<a\s[^>]*>.*?<\/a\s*>/im) { anchor_count += 1; '' }
  remainder = remainder.gsub(/<form\s[^>]*>.*?<\/form\s*>/im, '')
  return "" if remainder.length < 20 * anchor_count

  strip_tags(remainder)
end
141
+
142
# Determines whether a block is a link list or not: compares the length of
# the text outside the list against the block length, weighted by how
# link-dense the list items are.
def is_link_list?(st)
  if st =~ /<(?:ul|dl|ol)(.+?)<\/(?:ul|dl|ol)>/im
    listpart = $1
    # Text outside the list: strip the list itself, all remaining tags,
    # and collapse whitespace.
    # was (?:ul|dl) — <ol> was matched by the detection regex above but
    # not removed here, inflating the "outside" text for ordered lists;
    # include ol for consistency
    outside = st.gsub(/<(?:ul|dl|ol)(.+?)<\/(?:ul|dl|ol)>/imn, '').gsub(/<.+?>/mn, '').gsub(/\s+/, ' ')
    list = listpart.split(/<li[^>]*>/)
    list.shift
    rate = evaluate_list(list)
    outside.length <= st.length / (45 / rate)
  end
end
153
+
154
# Estimates to what degree the given <li> fragments are links,
# returning a rate between 1 (no links) and 10 (every item is a link).
def evaluate_list(list)
  return 1 if list.empty?
  hit = list.count { |line| line =~ /<a\s+href=(['"]?)([^"'\s]+)\1/imn }
  9 * (1.0 * hit / list.length) ** 2 + 1
end
163
+
164
# Removes all html tags and attributes from html, then trims each line.
def strip_tags(html)
  sanitized = Sanitize.clean(html, :elements => [], :attributes => [])
  strip(sanitized)
end
168
+
169
+ end
170
+ end
171
+ end
@@ -0,0 +1,154 @@
1
+ module Pismo
2
+ module Reader
3
+ class Tree < Base
4
+
5
+ # Analyze the structure of the HTML document and score branches for likelihood of containing useful content
6
+ def analyze
7
+ @tree = {}
8
+ subels = {}
9
+
10
+ t1 = Time.now.to_i + (Time.now.usec.to_f / 1000000)
11
+
12
+ @doc.css(COULD_CONTAIN_FULL_CONTENT.join(", ")).each do |el|
13
+ # Assume that no content we'll want comes in a total package of fewer than 80 characters!
14
+ next unless el.text.to_s.strip.length >= 80
15
+
16
+ path_segments = el.path.scan(/[a-z]+/)[2..-1] || []
17
+ depth = path_segments.length
18
+
19
+ local_ids = (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)
20
+ ids = local_ids
21
+
22
+ cp = el.parent
23
+ (depth - 1).times do
24
+ ids += (cp['id'].to_s + ' ' + cp['class'].to_s).downcase.strip.scan(/[a-z]+/)
25
+ cp = cp.parent
26
+ end if depth > 1
27
+
28
+ #puts "IDS"
29
+ #ap ids
30
+ #puts "LOCAL IDS"
31
+ #ap local_ids
32
+
33
+ branch = {}
34
+ branch[:ids] = ids
35
+ branch[:local_ids] = local_ids
36
+ branch[:score] = -(BAD_WORDS & ids).size
37
+ branch[:score] += ((GOOD_WORDS & ids).size * 2)
38
+ next if branch[:score] < -5
39
+
40
+ #puts "#{ids.join(",")} - #{branch[:score].to_s} - #{el.text.to_s.strip.length}"
41
+
42
+ # Elements that have an ID or class are more likely to be our winners
43
+ branch[:score] += 2 unless local_ids.empty?
44
+
45
+ branch[:name] = el.name
46
+ branch[:depth] = depth
47
+ branch[:path] = el.path
48
+
49
+ branch[:raw_word_count] = 0
50
+ branch[:word_count] = 0
51
+ branch[:child_count] = 0
52
+ branch[:bad_child_count] = 0
53
+ branch[:score_steps] = []
54
+
55
+
56
+ el.traverse do |subel|
57
+ div_at_end_of_branch = false if subel.name == "div"
58
+ path = subel.path
59
+ subels[path] ||= {}
60
+ subels[path][:path_segments] ||= (path.scan(/[a-z]+/)[2..-1] || [])
61
+ subels[path][:is_text] ||= subel.text?
62
+
63
+ if subels[path][:is_text]
64
+ subels[path][:text] ||= subel.text.downcase.scan(/[a-z]+/)
65
+ next if subels[path][:text].empty?
66
+
67
+ subels[path][:raw_word_count] ||= subels[path][:text].size
68
+ subels[path][:word_count] ||= (%{a h1 h2 h3 h4 h5 h6 h6}.include?(subel.parent.name) ? 0 : subels[path][:text].select { |word| word.length > 3 }.size)
69
+ subels[path][:meta_matches] ||= (subels[path][:text] & META_WORDS).size
70
+
71
+ branch[:raw_word_count] += subels[path][:raw_word_count]
72
+ branch[:word_count] += subels[path][:word_count] - subels[path][:meta_matches]
73
+ end
74
+
75
+ subels[path][:ids] ||= (subel['id'].to_s + ' ' + subel['class'].to_s).gsub(/[^a-z]/, ' ').downcase.strip.split(/\s+/)
76
+ subels[path][:bad_child_count_inc] = (BAD_WORDS & subels[path][:ids]).size - (GOOD_WORDS & subels[path][:ids]).size
77
+ subels[path][:child_count_inc] = subels[path][:ids].empty? ? 0 : 1
78
+
79
+ branch[:bad_child_count] += subels[path][:bad_child_count_inc]
80
+ branch[:child_count] += subels[path][:child_count_inc]
81
+ end
82
+
83
+ branch[:score] += 2 if branch[:name] == "div"
84
+ branch[:score] += 4 if el.text.scan(/\,\s/).size > 10
85
+ branch[:score_steps] << "lots of commas!" if el.text.scan(/\,\s/).size > 5
86
+ branch[:score] *= 3
87
+
88
+
89
+ branch[:score] *= 0.7 if el.children && el.children.size < 3
90
+ branch[:score] *= 1.25 if branch[:raw_word_count] > 10
91
+ next if branch[:raw_word_count] < 10
92
+ branch[:score] += [branch[:word_count], 1].max ** 0.5
93
+
94
+
95
+ word_child_count_ratio = branch[:word_count].to_f / [branch[:child_count], 1].max
96
+ branch[:word_child_count_ratio] = word_child_count_ratio
97
+
98
+ if branch[:raw_word_count] > 100
99
+ good_word_ratio = branch[:word_count].to_f / branch[:raw_word_count]
100
+ branch[:score] += good_word_ratio * 12
101
+
102
+ if word_child_count_ratio > 50
103
+ branch[:score] *= 1.5
104
+ elsif word_child_count_ratio > 30
105
+ branch[:score] *= 1.2
106
+ elsif word_child_count_ratio > 15
107
+ branch[:score] *= 1.1
108
+ elsif word_child_count_ratio < 4
109
+ branch[:score] *= 0.9
110
+ end
111
+ end
112
+
113
+ branch[:score_steps] << "s1: #{branch[:score]}"
114
+
115
+ bad_child_ratio = branch[:bad_child_count].to_f / [branch[:child_count], 1].max
116
+ branch[:bad_child_ratio] = bad_child_ratio
117
+ branch[:score] += 3 if bad_child_ratio < 0.0
118
+ branch[:score] -= 3 if bad_child_ratio > 0.15
119
+ branch[:score] -= 2 if bad_child_ratio > 0.25
120
+ branch[:score] -= 2 if bad_child_ratio > 0.4
121
+ branch[:score] -= 4 if bad_child_ratio > 0.5
122
+ branch[:score] -= 5 if bad_child_ratio > 0.7
123
+ branch[:score] -= 5 if branch[:bad_child_count] > 20
124
+
125
+ branch[:score] += depth
126
+ branch[:score] *= 0.8 if ids.length > 10
127
+
128
+
129
+
130
+ @tree[el.path] = branch
131
+ end
132
+
133
+
134
+ sorted_tree = @tree.sort_by { |k, v| v[:score] }
135
+
136
+ #ap @doc.at(sorted_tree.first[0]).text
137
+
138
+ # Sort the branches by their score in reverse order
139
+ @content_candidates = sorted_tree.reverse.first([5, sorted_tree.length].min)
140
+
141
+ #ap @content_candidates #.map { |i| [i[0], i[1][:name], i[1][:ids].join(','), i[1][:score] ]}
142
+ #t2 = Time.now.to_i + (Time.now.usec.to_f / 1000000)
143
+ #puts t2 - t1
144
+ #exit
145
+
146
+ end
147
+
148
# Resolve the candidate ranked at +index+ back to its live node in @doc.
# Each candidate is a [path, branch-hash] pair produced by #analyze.
def content_at(index)
  candidate_path = @content_candidates[index].first
  @doc.at(candidate_path)
end
151
+
152
+ end
153
+ end
154
+ end