nddrylliog_pismo 0.7.3

Files changed (43)
  1. data/.document +5 -0
  2. data/.gitignore +29 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +23 -0
  5. data/NOTICE +4 -0
  6. data/README.markdown +131 -0
  7. data/Rakefile +72 -0
  8. data/bin/pismo +45 -0
  9. data/lib/pismo.rb +82 -0
  10. data/lib/pismo/document.rb +67 -0
  11. data/lib/pismo/external_attributes.rb +14 -0
  12. data/lib/pismo/internal_attributes.rb +316 -0
  13. data/lib/pismo/reader.rb +19 -0
  14. data/lib/pismo/reader/base.rb +259 -0
  15. data/lib/pismo/reader/cluster.rb +171 -0
  16. data/lib/pismo/reader/tree.rb +154 -0
  17. data/lib/pismo/stopwords.txt +1002 -0
  18. data/lib/pismo/version.rb +3 -0
  19. data/pismo.gemspec +30 -0
  20. data/test/corpus/bbcnews.html +2131 -0
  21. data/test/corpus/bbcnews2.html +1575 -0
  22. data/test/corpus/briancray.html +269 -0
  23. data/test/corpus/cant_read.html +426 -0
  24. data/test/corpus/factor.html +1362 -0
  25. data/test/corpus/gmane.html +138 -0
  26. data/test/corpus/huffington.html +2932 -0
  27. data/test/corpus/metadata_expected.yaml +72 -0
  28. data/test/corpus/metadata_expected.yaml.old +122 -0
  29. data/test/corpus/queness.html +919 -0
  30. data/test/corpus/reader_expected.yaml +39 -0
  31. data/test/corpus/readers/cluster_expected.yaml +45 -0
  32. data/test/corpus/readers/tree_expected.yaml +55 -0
  33. data/test/corpus/rubyinside.html +318 -0
  34. data/test/corpus/rww.html +1351 -0
  35. data/test/corpus/spolsky.html +298 -0
  36. data/test/corpus/techcrunch.html +1285 -0
  37. data/test/corpus/tweet.html +360 -0
  38. data/test/corpus/youtube.html +2348 -0
  39. data/test/corpus/zefrank.html +535 -0
  40. data/test/helper.rb +15 -0
  41. data/test/test_corpus.rb +54 -0
  42. data/test/test_pismo_document.rb +34 -0
  43. metadata +156 -0
@@ -0,0 +1,14 @@
+ module Pismo
+   # External attributes return data that comes from external services or programs (e.g. Delicious tags)
+   module ExternalAttributes
+     #include HTTParty
+     #
+     #def delicious_tags
+     #  delicious_info["top_tags"].sort_by { |k, v| v }.reverse.first(5) rescue []
+     #end
+     #
+     #def delicious_info
+     #  @delicious_info ||= self.class.get('http://feeds.delicious.com/v2/json/urlinfo/' + Digest::MD5.hexdigest(@url)).first rescue nil
+     #end
+   end
+ end
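The Delicious integration above ships fully commented out. For context, a minimal sketch of what re-enabling it might look like, assuming `Pismo::Document` exposes `@url` and mixes this module in (as `pismo.rb`/`document.rb` in the file list suggest); the `HTTParty.get` wiring and the long-defunct Delicious endpoint are illustrative only, not part of this release:

    require 'httparty'
    require 'digest/md5'

    module Pismo
      module ExternalAttributes
        # Top five Delicious tags for the document's URL, or [] on any failure
        def delicious_tags
          delicious_info["top_tags"].sort_by { |k, v| v }.reverse.first(5) rescue []
        end

        def delicious_info
          @delicious_info ||= HTTParty.get('http://feeds.delicious.com/v2/json/urlinfo/' +
                                           Digest::MD5.hexdigest(@url)).first rescue nil
        end
      end
    end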
@@ -0,0 +1,316 @@
+ module Pismo
+   # Internal attributes are different pieces of data we can extract from a document's content
+   module InternalAttributes
+     # Returns the title of the page/content - attempts to strip the site name, etc., if possible
+     def title(all = false)
+       # TODO: Memoization
+       title = @doc.match(
+         [
+           '#pname a', # Google Code style
+           '.entryheader h1', # Ruby Inside/Kubrick
+           '.entry-title a', # Common Blogger/Blogspot rules
+           '.post-title a',
+           '.post_title a',
+           '.posttitle a',
+           '.post-header h1',
+           '.entry-title',
+           '.post-title',
+           '.post h1',
+           '.post h3 a',
+           'a.datitle', # Slashdot style
+           '.posttitle',
+           '.post_title',
+           '.pageTitle',
+           '#main h1.title',
+           '.title h1',
+           '.post h2',
+           'h2.title',
+           '.entry h2 a',
+           '.entry h2', # Common style
+           '.boite_titre a',
+           ['meta[@name="title"]', lambda { |el| el.attr('content') }],
+           'h1.headermain',
+           'h1.title',
+           '.mxb h1', # BBC News
+           '#content h1',
+           '#content h2',
+           '#content h3',
+           'a[@rel="bookmark"]',
+           '.products h2',
+           '.caption h3',
+           '#main h2',
+           '#body h1',
+           '#wrapper h1',
+           '#page h1',
+           '.asset-header h1',
+           '#body_content h2'
+         ],
+         all
+       )
+
+       # If all else fails, go to the HTML title
+       if all
+         return [html_title] if !title
+         return ([*title] + [html_title]).uniq
+       else
+         return html_title if !title
+         return title
+       end
+     end
+
+     def titles
+       title(true)
+     end
+
+
+     # HTML title
+     def html_title
+       title = @doc.match('title')
+       return unless title
+       title
+     end
+
+     # Return an estimate of when the page/content was created.
+     # As clients of this library should be doing HTTP retrieval themselves, they can fall back on the
+     # Last-Modified HTTP header if they so wish. This method is rough and based on content only.
+     def datetime
+       # TODO: Clean all this mess up
+
+       mo = %r{(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)}i
+
+       regexen = [
+         /#{mo}\b\s+\d+\D{1,10}\d{4}/i,
+         /(on\s+)?\d+\s+#{mo}\s+\D{1,10}\d+/i,
+         /(on[^\d+]{1,10})\d+(th|st|rd)?.{1,10}#{mo}\b[^\d]{1,10}\d+/i,
+         /\b\d{4}\-\d{2}\-\d{2}\b/i,
+         /\d+(th|st|rd).{1,10}#{mo}\b[^\d]{1,10}\d+/i,
+         /\d+\s+#{mo}\b[^\d]{1,10}\d+/i,
+         /on\s+#{mo}\s+\d+/i,
+         /#{mo}\s+\d+/i,
+         /\d{4}[\.\/\-]\d{2}[\.\/\-]\d{2}/,
+         /\d{2}[\.\/\-]\d{2}[\.\/\-]\d{4}/
+       ]
+
+       datetime = nil
+
+       regexen.each do |r|
+         break if datetime = @doc.to_html[r]
+       end
+
+       return unless datetime && datetime.length > 4
+
+       # Clean up the string for use by Chronic
+       datetime.strip!
+       datetime.gsub!(/(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)[^\w]*/i, '')
+       datetime.gsub!(/(mon|tues|tue|weds|wed|thurs|thur|thu|fri|sat|sun)[^\w]*/i, '')
+       datetime.sub!(/on\s+/, '')
+       datetime.gsub!(/\,/, '')
+       datetime.sub!(/(\d+)(th|st|rd)/, '\1')
+
+       Chronic.parse(datetime) || datetime
+     end
+
+     # Returns the author of the page/content
+     def author(all = false)
+       author = @doc.match([
+         '.post-author .fn',
+         '.wire_author',
+         '.cnnByline b',
+         '.editorlink',
+         '.authors p',
+         ['meta[@name="author"]', lambda { |el| el.attr('content') }], # Traditional meta tag style
+         ['meta[@name="Author"]', lambda { |el| el.attr('content') }], # CNN style
+         ['meta[@name="AUTHOR"]', lambda { |el| el.attr('content') }], # CNN style
+         '.byline a', # Ruby Inside style
+         '.byline',
+         '.post_subheader_left a', # TechCrunch style
+         '.byl', # BBC News style
+         '.articledata .author a',
+         '#owners a', # Google Code style
+         '.author a',
+         '.author',
+         '.auth a',
+         '.auth',
+         '.cT-storyDetails h5', # smh.com.au - worth dropping maybe..
+         ['meta[@name="byl"]', lambda { |el| el.attr('content') }],
+         '.timestamp a',
+         '.fn a',
+         '.fn',
+         '.byline-author',
+         '.ArticleAuthor a',
+         '.blog_meta a',
+         'cite a',
+         'cite',
+         '.contributor_details h4 a',
+         '.meta a'
+       ], all)
+
+       return unless author
+
+       # Strip off any "By [whoever]" section
+       if String === author
+         author.sub!(/^(post(ed)?\s)?by\W+/i, '')
+         author.tr!('^a-zA-Z 0-9\'', '|')
+         author = author.split(/\|{2,}/).first.to_s
+         author.gsub!(/\s+/, ' ')
+         author.gsub!(/\|/, '')
+         author.strip!
+       elsif Array === author
+         author.map! { |a| a.sub(/^(post(ed)?\s)?by\W+/i, '') }.uniq!
+       end
+
+       author
+     end
+
+     def authors
+       author(true)
+     end
+
+
+     # Returns the "description" of the page, which usually comes from a meta tag
+     def description
+       @doc.match([
+         ['meta[@name="description"]', lambda { |el| el.attr('content') }],
+         ['meta[@name="Description"]', lambda { |el| el.attr('content') }],
+         ['meta[@name="DESCRIPTION"]', lambda { |el| el.attr('content') }],
+         'rdf:Description[@name="dc:description"]',
+         '.description'
+       ])
+     end
+
+     # Returns the "lede(s)" or first paragraph(s) of the story/page
+     def lede(all = false)
+       lede = @doc.match([
+         '.post-text p',
+         '.post-body p',
+         '#blogpost p',
+         '.story-teaser',
+         '.article .body p',
+         '//div[@class="entrytext"]//p[string-length()>40]', # Ruby Inside / Kubrick style
+         'section p',
+         '.entry .text p',
+         '.hentry .content p',
+         '.entry-content p',
+         '#wikicontent p', # Google Code style
+         '.wikistyle p', # GitHub style
+         '//td[@class="storybody"]/p[string-length()>40]', # BBC News style
+         '//div[@class="entry"]//p[string-length()>100]',
+         # The below is a horrible, horrible way to pluck out lead paras from crappy Blogspot blogs that
+         # don't use <p> tags..
+         ['.entry-content', lambda { |el| el.inner_html[/(#{el.inner_text[0..4].strip}.*?)\<br/, 1] }],
+         ['.entry', lambda { |el| el.inner_html[/(#{el.inner_text[0..4].strip}.*?)\<br/, 1] }],
+         '.entry',
+         '#content p',
+         '#article p',
+         '.post-body',
+         '.entry-content',
+         '.document_description_short p', # Scribd
+         '.single-post p'
+       ], all)
+
+       # TODO: Improve sentence extraction - this is dire even if it "works for now"
+       if lede && String === lede
+         return (lede[/^(.*?[\.\!\?]\s){1,3}/m] || lede).to_s.strip
+       elsif lede && Array === lede
+         return lede.map { |l| (l.to_s[/^(.*?[\.\!\?]\s){1,3}/m] || l.to_s).strip }.uniq
+       else
+         return reader_doc && !reader_doc.sentences(4).empty? ? reader_doc.sentences(4).join(' ') : nil
+       end
+     end
+
+     def ledes
+       lede(true) rescue []
+     end
+
+     # Returns a string containing the first [limit] sentences as determined by the Reader algorithm
+     def sentences(limit = 3)
+       reader_doc && !reader_doc.sentences.empty? ? reader_doc.sentences(limit).join(' ') : nil
+     end
+
+     # Returns any images with absolute URLs in the document
+     def images(limit = 3)
+       reader_doc && !reader_doc.images.empty? ? reader_doc.images(limit) : nil
+     end
+
+     # Returns the "keywords" in the document (not the meta keywords - they're next to useless now)
+     def keywords(options = {})
+       options = { :stem_at => 20, :word_length_limit => 15, :limit => 20, :remove_stopwords => true, :minimum_score => 2 }.merge(options)
+
+       words = {}
+
+       # Convert doc to lowercase, scrub out most HTML tags, then keep track of words
+       cached_title = title.to_s
+       content_to_use = body.to_s.downcase + " " + description.to_s.downcase
+
+       # old regex for safe keeping -- \b[a-z][a-z\+\.\'\+\#\-]*\b
+       content_to_use.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\.+\s+/, ' ').gsub(/\&\w+\;/, '').scan(/(\b|\s|\A)([a-z0-9][a-z0-9\+\.\'\+\#\-\\]*)(\b|\s|\Z)/i).map { |ta1| ta1[1] }.compact.each do |word|
+         next if word.length > options[:word_length_limit]
+         word.gsub!(/^[\']/, '')
+         word.gsub!(/[\.\-\']$/, '')
+         next if options[:hints] && !options[:hints].include?(word)
+         words[word] ||= 0
+         words[word] += (cached_title.downcase =~ /\b#{word}\b/ ? 5 : 1)
+       end
+
+       # Stem the words and stop words if necessary
+       d = words.keys.uniq.map { |a| a.length > options[:stem_at] ? a.stem : a }
+       s = Pismo.stopwords.map { |a| a.length > options[:stem_at] ? a.stem : a }
+
+       words.delete_if { |k1, v1| v1 < options[:minimum_score] }
+       words.delete_if { |k1, v1| s.include?(k1) } if options[:remove_stopwords]
+       words.sort_by { |k2, v2| v2 }.reverse.first(options[:limit])
+     end
+
+     def reader_doc
+       @reader_doc ||= Reader::Document.create(@doc.to_s, @options)
+     end
+
+     # Returns body text as determined by the Reader algorithm
+     def body
+       @body ||= reader_doc.content(true).strip
+     end
+
+     # Returns body text as determined by the Reader algorithm WITH basic HTML formatting intact
+     def html_body
+       @html_body ||= reader_doc.content.strip
+     end
+
+     # Returns the URL of the site's favicon
+     def favicon
+       url = @doc.match([['link[@rel="fluid-icon"]', lambda { |el| el.attr('href') }], # Get a Fluid icon if possible..
+                         ['link[@rel="shortcut icon"]', lambda { |el| el.attr('href') }],
+                         ['link[@rel="icon"]', lambda { |el| el.attr('href') }]])
+       if url && url !~ /^http/ && @url
+         url = URI.join(@url, url).to_s
+       end
+
+       url
+     end
+
+     # Returns URL(s) of Web feed(s)
+     def feed(all = false)
+       url = @doc.match([['link[@type="application/rss+xml"]', lambda { |el| el.attr('href') }],
+                         ['link[@type="application/atom+xml"]', lambda { |el| el.attr('href') }]], all
+       )
+
+       if url && String === url && url !~ /^http/ && @url
+         url = URI.join(@url, url).to_s
+       elsif url && Array === url
+         url.map! do |u|
+           if u !~ /^http/ && @url
+             URI.join(@url, u).to_s
+           else
+             u
+           end
+         end
+         url.uniq!
+       end
+
+       url
+     end
+
+     def feeds
+       feed(true)
+     end
+   end
+ end
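These attribute methods form the gem's main query surface. A usage sketch (hedged: `Pismo::Document.new` accepting an HTML string follows the README included in this changeset; the corpus file is just a convenient input):

    require 'pismo'

    doc = Pismo::Document.new(File.read('test/corpus/rubyinside.html'))

    doc.title      # best-guess title via the CSS/XPath rules above
    doc.datetime   # Chronic-parsed time, or the raw matched string
    doc.author     # byline with any leading "By" stripped
    doc.lede       # first sentence(s) of the story
    doc.keywords   # [[word, score], ...] pairs, highest score first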
@@ -0,0 +1,19 @@
+ module Pismo
+   module Reader
+     class Document
+
+       def self.create(raw_content, options = {})
+         type = options.delete(:reader)
+         case type
+         when :score
+           Pismo::Reader::Tree.new(raw_content, options)
+         when :cluster
+           Pismo::Reader::Cluster.new(raw_content, options)
+         else
+           Pismo::Reader::Tree.new(raw_content, options)
+         end
+       end
+
+     end
+   end
+ end
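The factory selects an extraction strategy via the `:reader` option; note that both `:score` and the default currently map to the tree-based reader. A quick sketch:

    require 'pismo'

    html = File.read('test/corpus/bbcnews.html')

    tree    = Pismo::Reader::Document.create(html)                      # default (Tree)
    cluster = Pismo::Reader::Document.create(html, :reader => :cluster) # clustering reader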
@@ -0,0 +1,259 @@
+ require 'nokogiri'
+ require 'sanitize'
+ begin; require 'ap'; rescue LoadError; end
+
+ module Pismo
+   module Reader
+     class Base
+       attr_reader :raw_content, :doc, :content_candidates, :options
+
+       # Elements to keep for /input/ sanitization
+       OK_ELEMENTS = %w{a td br th tbody table tr div span img strong em b i body html head title p h1 h2 h3 h4 h5 h6 pre code tt ul li ol blockquote font big small section article abbr audio video cite dd dt figure caption sup form dl dt dd center}
+
+       # Build a tree of attributes that are allowed for each element.. doing it this messy way due to how Sanitize works, alas
+       OK_ATTRIBUTES = {}
+       OK_CLEAN_ATTRIBUTES = {}
+       OK_ELEMENTS.each { |el| OK_ATTRIBUTES[el] = %w{id class href name content type alt title src} }
+       OK_ELEMENTS.each { |el| OK_CLEAN_ATTRIBUTES[el] = %w{href title src alt} }
+
+
+       # Words that we'd like to see in class and ID names for "content"
+       GOOD_WORDS = %w{content post blogpost main story body entry text desc asset hentry single entrytext postcontent bodycontent}.uniq
+
+       # Words that indicate crap in general
+       BAD_WORDS = %w{reply metadata options commenting comments comment about footer header outer credit sidebar widget subscribe clearfix date social bookmarks links share video watch excerpt related supplement accessibility offscreen meta title signup blq secondary feedback featured clearfix small job jobs listing listings navigation nav byline addcomment postcomment trackback neighbor ads commentform fbfans login similar thumb link blogroll grid twitter wrapper container nav sitesub printfooter editsection visualclear catlinks hidden toc contentsub caption disqus rss shoutbox sponsor blogcomments}.uniq
+
+       # Words that kill a branch dead
+       FATAL_WORDS = %w{comments comment bookmarks social links ads related similar footer digg totop metadata sitesub nav sidebar commenting options addcomment leaderboard offscreen job prevlink prevnext navigation reply-link hide hidden sidebox archives vcard}
+
+       META_WORDS = %w{january february march april may june july august september october november december jan feb mar apr may jun jul aug sep oct nov dec st th rd nd comments written posted on at published 2000 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 updated last gmt est pst pdt edt cet cdt cst article feature featured filed under comment comments follow twitter facebook email e-mail register story continue continues reading read inside more page next related response responses respond contact street phone tel email e-mail fax info tags tagged tag thanks credit creative commons copy nbsp lt gt this friend printable version subscribe rss mail follow twitter article via leave}.uniq
+
+       WONT_CONTAIN_FULL_CONTENT = %w{h1 h2 h3 h4 h5 h6 li ol ul br a img meta cite strong em i b input head small big code title sup sub dd dt}
+       COULD_CONTAIN_FULL_CONTENT = %w{body div p table tr td article pre blockquote tbody section}
+
+       ## Output sanitization element sets
+       BLOCK_OUTPUT_ELEMENTS = %w{div p h2 h3 h4 h5 h6 li dl pre ul ol blockquote section article audio video cite dd dt figure caption br table tr td thead tbody tfoot}
+       INLINE_OUTPUT_ELEMENTS = %w{a img b strong em i br code sup font small big dd dt}
+       OUTPUT_ELEMENTS = BLOCK_OUTPUT_ELEMENTS + INLINE_OUTPUT_ELEMENTS
+       NON_HEADER_ELEMENTS = %w{p br}
+
+       # Create a document object based on the raw HTML content provided
+       def initialize(raw_content, options = {})
+         @options = options
+         @raw_content = Pismo::Document.clean_html(raw_content)
+         build_doc
+       end
+
+       def build_doc
+         @content = {}
+
+         if RUBY_VERSION > "1.9"
+           @raw_content.encode!("UTF-8", :invalid => :replace, :replace => '?') if @raw_content.encoding.name != "UTF-8"
+           @raw_content.encode!("ASCII-8BIT", :invalid => :replace, :replace => '?') if !@raw_content.valid_encoding?
+         end
+
+         # Normalize whitespace (as much to make debugging sessions look nice as anything else)
+         @raw_content.gsub!(/\s{2,}/, ' ')
+         @raw_content.gsub!(/\r/, "\n")
+         @raw_content.gsub!(/\n{3,}/, "\n\n")
+         @raw_content.gsub!(/(\<br(\s\/)?\>){2,}/, "</p><p>")
+
+         # Remove scripts manually; Sanitize and/or Nokogiri seem to go a bit funny with them
+         @raw_content.gsub!(/\<script .*?\<\/script\>/im, '')
+
+         # Get rid of bullshit "smart" quotes and other Unicode nonsense
+         @raw_content.force_encoding("ASCII-8BIT") if RUBY_VERSION > "1.9"
+         @raw_content.gsub!("\xe2\x80\x89", " ")
+         @raw_content.gsub!("\xe2\x80\x99", "'")
+         @raw_content.gsub!("\xe2\x80\x98", "'")
+         @raw_content.gsub!("\xe2\x80\x9c", '"')
+         @raw_content.gsub!("\xe2\x80\x9d", '"')
+         @raw_content.gsub!("\xe2\x80\xa6", '.') # ellipsis
+         @raw_content.force_encoding("UTF-8") if RUBY_VERSION > "1.9"
+
+
+         # Sanitize the HTML
+         @raw_content = Sanitize.clean(@raw_content,
+           :elements => OK_ELEMENTS,
+           :attributes => OK_ATTRIBUTES,
+           :remove_contents => true,
+           :output_encoding => 'utf-8'
+         )
+
+         @doc = Nokogiri::HTML(@raw_content, nil, 'utf-8')
+
+         # Do a pre clean up of elements.
+         @doc.css("div, span, table, tr, td, pre").each do |el|
+           # Any block elements with no child block elements can become paragraphs
+           if (BLOCK_OUTPUT_ELEMENTS & el.inner_html.scan(/\<(\w+)/).flatten).empty?
+             el.name = "p"
+           elsif el.name != "span"
+             el.name = "div"
+           end
+
+           # Any SPANs that aren't within paragraphs can become paragraphs too
+           el.name = "p" if el.name == "span" && !el.path.scan(/[a-z]+/).include?('p')
+
+           el.remove if (FATAL_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)).size > 0
+         end
+
+         analyze
+       end
+
+       # Return the content from best match number of index (default 0) and, optionally, clean it to plain-text
+       def content(clean = false, index = 0)
+         return @content[[clean, index]] if @content[[clean, index]]
+         return '' if !@content_candidates || @content_candidates.empty?
+
+         content_branch = content_at(index)
+         orphans_to_remove = []
+
+         #ap content_branch.to_html
+         #exit
+
+         # Go through every piece of the content and rip out sections that contain too many tags compared to words
+         # This is usually indicative of "widgets" or link bar sections
+         content_branch.css('*').each_with_index do |el, i|
+           next unless el
+
+           if el.name == "h1"
+             el.remove
+             next
+           end
+
+           if el.name == "h2" && content_branch.inner_html.scan('<h2').size == 1
+             el.remove
+           end
+
+           # Remove elements that contain words but there are more tags than words overall
+           # First, count the words
+           #word_count = 0
+           #el.traverse do |subel|
+           #  if subel.text? && subel.path !~ /\/a\// && subel.path !~ /\/(h1|h2|h3|h4|h5|h6)\//
+           #    word_count += (subel.text.downcase.scan(/[a-z]{4,}/) - META_WORDS).size
+           #  end
+           #end
+           #
+           ## .. then count the tags
+           #
+           #inner_tags = el.inner_html.scan(/\<\w.*?\>/).size
+           #if word_count < inner_tags && inner_tags > 3 && word_count < 250
+           #  puts "At #{el.name} #{el['id']} #{el['class']} containing '#{el.text[0..20]}' we have #{word_count} valid words to #{el.inner_html.scan(/\<\w.*?\>/).size} tags"
+           #  #puts "Removing #{el.name} #{el['id']} #{el['class']} TOO MANY TAGS FOR WORDS"
+           #  el.remove
+           #  next
+           #end
+
+           # If there are at least 2 words and a third of them are "meta words," remove the element
+           #inner_words = el.text.to_s.downcase.scan(/[a-z]{3,}/)
+           #if BLOCK_OUTPUT_ELEMENTS.include?(el.name) && inner_words.size >= 2
+           #  if ((inner_words & META_WORDS).size >= (inner_words.size / 3))
+           #    el.remove
+           #  end
+           #end
+
+           if el.text && el.text.strip.length < 3 && !%w{img}.include?(el.name) && el.inner_html !~ /\<img/
+             el.remove
+             next
+           end
+
+           if el.name == "p" && el.text !~ /(\.|\?|\!|\"|\')(\s|$)/ && el.inner_html !~ /\<img/
+             el.remove
+             next
+           end
+
+           # If the ID or class of the element contains a bad word, get rid of it
+           if (BAD_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.scan(/[a-z]+/)).length > 0
+             #puts "Removing #{el.name} #{el['id']} #{el['class']} BAD"
+             el.remove
+             next
+           end
+         end
+
+         # If a title was found early in the result document but had text before it, remove that text - it's probably crap
+         orphans_to_remove.each { |el| el.remove }
+
+         # Clean up the HTML again - Nokogiri outputs it with full doctype and crap
+         clean_html = strip(Sanitize.clean(content_branch.to_html, :elements => (clean ? BLOCK_OUTPUT_ELEMENTS : OUTPUT_ELEMENTS), :attributes => (clean ? OK_CLEAN_ATTRIBUTES : OK_ATTRIBUTES)))
+
+         # If the content is desired as "clean" (i.e. plain-text), do some quick fix-ups
+         if clean
+           # Get rid of line break tags, make list items look nice, remove all other HTML tags, and clean up spaces and newlines
+           clean_html.gsub!(/<br.*?>/, "\n")
+           clean_html.gsub!(/<li>/, '* ')
+           clean_html.gsub!(/<\w+>/, '')
+           clean_html.gsub!(/<\/\w+>/, "\n")
+           clean_html.gsub!(/\ +/, ' ')
+           clean_html.gsub!(/^\s+\n/, "\n")
+           clean_html.gsub!(/\n{2,}/, "\n")
+           clean_html.strip!
+         end
+
+         # If tags butt up against each other across lines, remove the line break(s)
+         clean_html.gsub!(/\>\n+\</, '><')
+
+         # Get rid of images whose sources are relative (TODO: Make this optional)
+         clean_html.gsub!(/\<img .*?\>/i) do |img_tag|
+           img_tag =~ /\Whttp/ ? img_tag : ''
+         end
+
+         # Remove empty tags
+         clean_html.gsub!(/<(\w+)><\/\1>/, "")
+
+         # Just a messy, hacky way to make output look nicer with subsequent paragraphs..
+         clean_html.gsub!(/<\/(div|p|h1|h2|h3|h4|h5|h6)>/, '</\1>' + "\n\n")
+
+         @content[[clean, index]] = clean_html
+       end
+
+       def sentences(qty = 3)
+         clean_content = Sanitize.clean(content, :elements => NON_HEADER_ELEMENTS, :attributes => OK_CLEAN_ATTRIBUTES, :remove_contents => %w{h1 h2 h3 h4 h5 h6})
+
+         fodder = ''
+         doc = Nokogiri::HTML(clean_content, nil, 'utf-8')
+
+         doc.traverse do |el|
+           path_segments = el.path.scan(/[a-z]+/)[2..-1]
+           next unless path_segments && path_segments.length > 1
+           if el.text? && el.text.strip.length < 3
+             el.remove
+             next
+           end
+           if el.text? && NON_HEADER_ELEMENTS.include?(path_segments[-2])
+             text = el.text.strip
+             text += "." if text !~ /[\.\!\?\"\']$/
+             fodder += text + "\n"
+           end
+         end
+
+         # Work on a copy so the memoized content cache isn't mutated below
+         fodder = content(true).dup if fodder.to_s.length < 50
+         fodder.gsub!(/\b\w\W\s/, '')
+
+         #sentences = fodder.scan(/([\&\w\s\-\'\,\+\.\/\\\:\#\(\)\=\"\?\!]+?[\.\?\!])(\s|\Z)/im).map { |s| s.first }
+         sentences = fodder.scan(/(.+?[\.\?\!])(\s|\Z)/im).map { |s| s.first.strip }
+
+         sentences.compact!
+         sentences.map! { |s| s.strip }
+         sentences.map! { |s| s.sub(/^[^\"\'a-z0-9\(\[]+/im, '') }
+         sentences.map! { |s| s.sub(/[^a-z0-9\'\"\)\]\.\!\:\?]+$/im, '') }
+         sentences.map! { |s| s.gsub(/\s+/m, ' ') }
+         sentences.first(qty)
+       end
+
+       def images(qty = 3)
+         doc = Nokogiri::HTML(content, nil, 'utf-8')
+         images = []
+         doc.css("img").each do |img|
+           images << img['src']
+           break if images.length == qty
+         end
+         images
+       end
+
+       # Remove leading and trailing spaces on lines throughout a string (a bit like String#strip, but for multi-lines)
+       def strip(s)
+         s.gsub(/^\s+/, '').gsub(/\s+$/, '')
+       end
+     end
+   end
+ end
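`Base` is effectively abstract: it calls `analyze` and `content_at`, which are supplied by the `Tree` and `Cluster` readers listed in the files above. A usage sketch against the tree reader, using only method names defined in this hunk:

    require 'pismo'

    reader = Pismo::Reader::Tree.new(File.read('test/corpus/spolsky.html'))

    reader.content        # best content candidate, basic HTML intact
    reader.content(true)  # plain-text rendering of the same candidate
    reader.sentences(2)   # first two sentences from the candidate
    reader.images(3)      # up to three image src URLs from the content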