nddrylliog_pismo 0.7.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +29 -0
- data/Gemfile +4 -0
- data/LICENSE +23 -0
- data/NOTICE +4 -0
- data/README.markdown +131 -0
- data/Rakefile +72 -0
- data/bin/pismo +45 -0
- data/lib/pismo.rb +82 -0
- data/lib/pismo/document.rb +67 -0
- data/lib/pismo/external_attributes.rb +14 -0
- data/lib/pismo/internal_attributes.rb +316 -0
- data/lib/pismo/reader.rb +19 -0
- data/lib/pismo/reader/base.rb +259 -0
- data/lib/pismo/reader/cluster.rb +171 -0
- data/lib/pismo/reader/tree.rb +154 -0
- data/lib/pismo/stopwords.txt +1002 -0
- data/lib/pismo/version.rb +3 -0
- data/pismo.gemspec +30 -0
- data/test/corpus/bbcnews.html +2131 -0
- data/test/corpus/bbcnews2.html +1575 -0
- data/test/corpus/briancray.html +269 -0
- data/test/corpus/cant_read.html +426 -0
- data/test/corpus/factor.html +1362 -0
- data/test/corpus/gmane.html +138 -0
- data/test/corpus/huffington.html +2932 -0
- data/test/corpus/metadata_expected.yaml +72 -0
- data/test/corpus/metadata_expected.yaml.old +122 -0
- data/test/corpus/queness.html +919 -0
- data/test/corpus/reader_expected.yaml +39 -0
- data/test/corpus/readers/cluster_expected.yaml +45 -0
- data/test/corpus/readers/tree_expected.yaml +55 -0
- data/test/corpus/rubyinside.html +318 -0
- data/test/corpus/rww.html +1351 -0
- data/test/corpus/spolsky.html +298 -0
- data/test/corpus/techcrunch.html +1285 -0
- data/test/corpus/tweet.html +360 -0
- data/test/corpus/youtube.html +2348 -0
- data/test/corpus/zefrank.html +535 -0
- data/test/helper.rb +15 -0
- data/test/test_corpus.rb +54 -0
- data/test/test_pismo_document.rb +34 -0
- metadata +156 -0
@@ -0,0 +1,14 @@
|
|
1
|
+
module Pismo
  # Holds attributes whose values come from services outside the document
  # itself (e.g. Delicious tags). Currently a placeholder: the Delicious
  # integration below is disabled and the module defines no live methods.
  module ExternalAttributes
    #include HTTParty
    #
    #def delicious_tags
    #  delicious_info["top_tags"].sort_by { |k, v| v }.reverse.first(5) rescue []
    #end
    #
    #def delicious_info
    #  @delicious_info ||= self.class.get('http://feeds.delicious.com/v2/json/urlinfo/' + Digest::MD5.hexdigest(@url)).first rescue nil
    #end
  end
end
|
@@ -0,0 +1,316 @@
|
|
1
|
+
module Pismo
  # Internal attributes are different pieces of data we can extract from a
  # document's content. Mixed into Pismo::Document; expects the host to
  # provide @doc (with a #match method), @options, and optionally @url.
  module InternalAttributes
    # Returns the title of the page/content - attempts to strip site name, etc, if possible.
    # When +all+ is true, returns an Array of every candidate title found
    # (always including the HTML <title>); otherwise the single best match.
    def title(all = false)
      # TODO: Memoizations
      title = @doc.match(
        [
          '#pname a',               # Google Code style
          '.entryheader h1',        # Ruby Inside/Kubrick
          '.entry-title a',         # Common Blogger/Blogspot rules
          '.post-title a',
          '.post_title a',
          '.posttitle a',
          '.post-header h1',
          '.entry-title',
          '.post-title',
          '.post h1',
          '.post h3 a',
          'a.datitle',              # Slashdot style
          '.posttitle',
          '.post_title',
          '.pageTitle',
          '#main h1.title',
          '.title h1',
          '.post h2',
          'h2.title',
          '.entry h2 a',
          '.entry h2',              # Common style
          '.boite_titre a',
          ['meta[@name="title"]', lambda { |el| el.attr('content') }],
          'h1.headermain',
          'h1.title',
          '.mxb h1',                # BBC News
          '#content h1',
          '#content h2',
          '#content h3',
          'a[@rel="bookmark"]',
          '.products h2',
          '.caption h3',
          '#main h2',
          '#body h1',
          '#wrapper h1',
          '#page h1',
          '.asset-header h1',
          '#body_content h2'
        ],
        all
      )

      # If all else fails, go to the HTML title
      if all
        return [html_title] if !title
        return ([*title] + [html_title]).uniq
      else
        return html_title if !title
        return title
      end
    end

    # Convenience wrapper: every candidate title as an Array.
    def titles
      title(true)
    end

    # HTML title - the raw contents of the <title> element, or nil.
    def html_title
      title = @doc.match('title')
      return unless title
      title
    end

    # Return an estimate of when the page/content was created.
    # As clients of this library should be doing HTTP retrieval themselves, they can fall to the
    # Last-Updated HTTP header if they so wish. This method is just rough and based on content only.
    # Returns a Chronic-parsed time when possible, otherwise the raw matched string, or nil.
    def datetime
      # TODO: Clean all this mess up

      mo = %r{(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)}i

      regexen = [
        /#{mo}\b\s+\d+\D{1,10}\d{4}/i,
        /(on\s+)?\d+\s+#{mo}\s+\D{1,10}\d+/i,
        /(on[^\d+]{1,10})\d+(th|st|rd)?.{1,10}#{mo}\b[^\d]{1,10}\d+/i,
        /\b\d{4}\-\d{2}\-\d{2}\b/i,
        /\d+(th|st|rd).{1,10}#{mo}\b[^\d]{1,10}\d+/i,
        /\d+\s+#{mo}\b[^\d]{1,10}\d+/i,
        /on\s+#{mo}\s+\d+/i,
        /#{mo}\s+\d+/i,
        /\d{4}[\.\/\-]\d{2}[\.\/\-]\d{2}/,
        /\d{2}[\.\/\-]\d{2}[\.\/\-]\d{4}/
      ]

      # FIX: was initialised to the Integer 10, which would have blown up on
      # the .length call below had the loop body ever been skipped. The loop
      # always reassigns, so nil is both safe and honest.
      datetime = nil

      regexen.each do |r|
        break if datetime = @doc.to_html[r]
      end

      return unless datetime && datetime.length > 4

      # Clean up the string for use by Chronic
      datetime.strip!
      datetime.gsub!(/(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)[^\w]*/i, '')
      datetime.gsub!(/(mon|tues|tue|weds|wed|thurs|thur|thu|fri|sat|sun)[^\w]*/i, '')
      datetime.sub!(/on\s+/, '')
      datetime.gsub!(/\,/, '')
      datetime.sub!(/(\d+)(th|st|rd)/, '\1')

      Chronic.parse(datetime) || datetime
    end

    # Returns the author of the page/content. When +all+ is true, returns an
    # Array of candidates; otherwise a single cleaned-up name String (or nil).
    def author(all = false)
      author = @doc.match([
        '.post-author .fn',
        '.wire_author',
        '.cnnByline b',
        '.editorlink',
        '.authors p',
        ['meta[@name="author"]', lambda { |el| el.attr('content') }],   # Traditional meta tag style
        ['meta[@name="Author"]', lambda { |el| el.attr('content') }],   # CNN style
        ['meta[@name="AUTHOR"]', lambda { |el| el.attr('content') }],   # CNN style
        '.byline a',                                                    # Ruby Inside style
        '.byline',
        '.post_subheader_left a',                                       # TechCrunch style
        '.byl',                                                         # BBC News style
        '.articledata .author a',
        '#owners a',                                                    # Google Code style
        '.author a',
        '.author',
        '.auth a',
        '.auth',
        '.cT-storyDetails h5',                                          # smh.com.au - worth dropping maybe..
        ['meta[@name="byl"]', lambda { |el| el.attr('content') }],
        '.timestamp a',
        '.fn a',
        '.fn',
        '.byline-author',
        '.ArticleAuthor a',
        '.blog_meta a',
        'cite a',
        'cite',
        '.contributor_details h4 a',
        '.meta a'
      ], all)

      return unless author

      # Strip off any "By [whoever]" section
      if String === author
        author.sub!(/^(post(ed)?\s)?by\W+/i, '')
        # Replace every char outside the allowed set with '|' so junk runs can
        # be split off below.
        author.tr!('^a-zA-Z 0-9\'', '|')
        author = author.split(/\|{2,}/).first.to_s
        author.gsub!(/\s+/, ' ')
        author.gsub!(/\|/, '')
        author.strip!
      elsif Array === author
        author.map! { |a| a.sub(/^(post(ed)?\s)?by\W+/i, '') }.uniq!
      end

      author
    end

    # Convenience wrapper: every author candidate as an Array.
    def authors
      author(true)
    end

    # Returns the "description" of the page, usually comes from a meta tag
    def description
      @doc.match([
        ['meta[@name="description"]', lambda { |el| el.attr('content') }],
        ['meta[@name="Description"]', lambda { |el| el.attr('content') }],
        ['meta[@name="DESCRIPTION"]', lambda { |el| el.attr('content') }],
        'rdf:Description[@name="dc:description"]',
        '.description'
      ])
    end

    # Returns the "lede(s)" or first paragraph(s) of the story/page.
    # When +all+ is true, returns an Array of candidate ledes; otherwise a
    # single String, falling back to the Reader algorithm's first sentences.
    def lede(all = false)
      lede = @doc.match([
        '.post-text p',
        '.post-body p',
        '#blogpost p',
        '.story-teaser',
        '.article .body p',
        '//div[@class="entrytext"]//p[string-length()>40]',   # Ruby Inside / Kubrick style
        'section p',
        '.entry .text p',
        '.hentry .content p',
        '.entry-content p',
        '#wikicontent p',                                     # Google Code style
        '.wikistyle p',                                       # GitHub style
        '//td[@class="storybody"]/p[string-length()>40]',     # BBC News style
        '//div[@class="entry"]//p[string-length()>100]',
        # The below is a horrible, horrible way to pluck out lead paras from crappy Blogspot blogs that
        # don't use <p> tags..
        ['.entry-content', lambda { |el| el.inner_html[/(#{el.inner_text[0..4].strip}.*?)\<br/, 1] }],
        ['.entry', lambda { |el| el.inner_html[/(#{el.inner_text[0..4].strip}.*?)\<br/, 1] }],
        '.entry',
        '#content p',
        '#article p',
        '.post-body',
        '.entry-content',
        '.document_description_short p',                      # Scribd
        '.single-post p'
      ], all)

      # TODO: Improve sentence extraction - this is dire even if it "works for now"
      if lede && String === lede
        return (lede[/^(.*?[\.\!\?]\s){1,3}/m] || lede).to_s.strip
      elsif lede && Array === lede
        # FIX: apply the || fallback BEFORE .strip. Previously a candidate with
        # no sentence-ending match made the regex return nil and nil.strip
        # raised NoMethodError (silently swallowed by ledes' rescue).
        return lede.map { |l| (l.to_s[/^(.*?[\.\!\?]\s){1,3}/m] || l.to_s).strip }.uniq
      else
        return reader_doc && !reader_doc.sentences(4).empty? ? reader_doc.sentences(4).join(' ') : nil
      end
    end

    # Convenience wrapper: every lede candidate as an Array ([] on error).
    def ledes
      lede(true) rescue []
    end

    # Returns a string containing the first [limit] sentences as determined by the Reader algorithm
    def sentences(limit = 3)
      reader_doc && !reader_doc.sentences.empty? ? reader_doc.sentences(limit).join(' ') : nil
    end

    # Returns any images with absolute URLs in the document
    def images(limit = 3)
      reader_doc && !reader_doc.images.empty? ? reader_doc.images(limit) : nil
    end

    # Returns the "keywords" in the document (not the meta keywords - they're next to useless now).
    # Result is an Array of [word, score] pairs, best first. Words appearing in
    # the title score 5 per occurrence instead of 1.
    def keywords(options = {})
      options = { :stem_at => 20, :word_length_limit => 15, :limit => 20, :remove_stopwords => true, :minimum_score => 2 }.merge(options)

      words = {}

      # Convert doc to lowercase, scrub out most HTML tags, then keep track of words
      cached_title = title.to_s
      content_to_use = body.to_s.downcase + " " + description.to_s.downcase

      # old regex for safe keeping -- \b[a-z][a-z\+\.\'\+\#\-]*\b
      content_to_use.downcase.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\.+\s+/, ' ').gsub(/\&\w+\;/, '').scan(/(\b|\s|\A)([a-z0-9][a-z0-9\+\.\'\+\#\-\\]*)(\b|\s|\Z)/i).map { |ta1| ta1[1] }.compact.each do |word|
        next if word.length > options[:word_length_limit]
        word.gsub!(/^[\']/, '')
        word.gsub!(/[\.\-\']$/, '')
        next if options[:hints] && !options[:hints].include?(word)
        words[word] ||= 0
        # FIX: escape the word before interpolating it into a Regexp — words
        # containing metacharacters (e.g. "c++") previously raised RegexpError.
        words[word] += (cached_title.downcase =~ /\b#{Regexp.escape(word)}\b/ ? 5 : 1)
      end

      # Stem the stop words if necessary. (A stemmed copy of the keyword list
      # was previously computed here too, but it was never used, so it's gone.)
      s = Pismo.stopwords.map { |a| a.length > options[:stem_at] ? a.stem : a }

      words.delete_if { |k1, v1| v1 < options[:minimum_score] }
      words.delete_if { |k1, v1| s.include?(k1) } if options[:remove_stopwords]
      words.sort_by { |k2, v2| v2 }.reverse.first(options[:limit])
    end

    # Lazily builds (and memoizes) the Reader document for this page.
    def reader_doc
      @reader_doc ||= Reader::Document.create(@doc.to_s, @options)
    end

    # Returns body text as determined by Reader algorithm
    def body
      @body ||= reader_doc.content(true).strip
    end

    # Returns body text as determined by Reader algorithm WITH basic HTML formatting intact
    def html_body
      @html_body ||= reader_doc.content.strip
    end

    # Returns URL to the site's favicon; relative hrefs are resolved against
    # @url when it is available.
    def favicon
      url = @doc.match([['link[@rel="fluid-icon"]', lambda { |el| el.attr('href') }], # Get a Fluid icon if possible..
                        ['link[@rel="shortcut icon"]', lambda { |el| el.attr('href') }],
                        ['link[@rel="icon"]', lambda { |el| el.attr('href') }]])
      if url && url !~ /^http/ && @url
        url = URI.join(@url, url).to_s
      end

      url
    end

    # Returns URL(s) of Web feed(s). When +all+ is true, returns every feed
    # link found (deduplicated); relative URLs are resolved against @url.
    def feed(all = false)
      url = @doc.match([['link[@type="application/rss+xml"]', lambda { |el| el.attr('href') }],
                        ['link[@type="application/atom+xml"]', lambda { |el| el.attr('href') }]], all
      )

      if url && String === url && url !~ /^http/ && @url
        url = URI.join(@url, url).to_s
      elsif url && Array === url
        url.map! do |u|
          if u !~ /^http/ && @url
            URI.join(@url, u).to_s
          else
            u
          end
        end
        url.uniq!
      end

      url
    end

    # Convenience wrapper: every feed URL as an Array.
    def feeds
      feed(true)
    end
  end
end
|
data/lib/pismo/reader.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
module Pismo
  module Reader
    # Factory facade for the concrete reader implementations.
    class Document
      # Builds a reader for +raw_content+ based on options[:reader]:
      #   :cluster           -> Pismo::Reader::Cluster
      #   :score or anything -> Pismo::Reader::Tree (the default)
      # The :reader key is removed from +options+ before the remaining
      # options are handed to the chosen implementation.
      def self.create(raw_content, options = {})
        implementation =
          if options.delete(:reader) == :cluster
            Pismo::Reader::Cluster
          else
            Pismo::Reader::Tree
          end
        implementation.new(raw_content, options)
      end
    end
  end
end
|
@@ -0,0 +1,259 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'sanitize'
|
3
|
+
begin; require 'ap'; rescue LoadError; end
|
4
|
+
|
5
|
+
module Pismo
  module Reader
    # Shared machinery for the reader algorithms (Tree/Cluster subclasses).
    # Subclasses are expected to define #analyze (called from #build_doc),
    # #content_at, and to populate @content_candidates.
    class Base
      attr_reader :raw_content, :doc, :content_candidates, :options

      # Elements to keep for /input/ sanitization
      OK_ELEMENTS = %w{a td br th tbody table tr div span img strong em b i body html head title p h1 h2 h3 h4 h5 h6 pre code tt ul li ol blockquote font big small section article abbr audio video cite dd dt figure caption sup form dl dt dd center}

      # Build a tree of attributes that are allowed for each element.. doing it this messy way due to how Sanitize works, alas
      OK_ATTRIBUTES = {}
      OK_CLEAN_ATTRIBUTES = {}
      OK_ELEMENTS.each { |el| OK_ATTRIBUTES[el] = %w{id class href name content type alt title src} }
      OK_ELEMENTS.each { |el| OK_CLEAN_ATTRIBUTES[el] = %w{href title src alt} }

      # Words that we'd like to see in class and ID names for "content"
      GOOD_WORDS = %w{content post blogpost main story body entry text desc asset hentry single entrytext postcontent bodycontent}.uniq

      # Words that indicate crap in general
      BAD_WORDS = %w{reply metadata options commenting comments comment about footer header outer credit sidebar widget subscribe clearfix date social bookmarks links share video watch excerpt related supplement accessibility offscreen meta title signup blq secondary feedback featured clearfix small job jobs listing listings navigation nav byline addcomment postcomment trackback neighbor ads commentform fbfans login similar thumb link blogroll grid twitter wrapper container nav sitesub printfooter editsection visualclear catlinks hidden toc contentsub caption disqus rss shoutbox sponsor blogcomments}.uniq

      # Words that kill a branch dead
      FATAL_WORDS = %w{comments comment bookmarks social links ads related similar footer digg totop metadata sitesub nav sidebar commenting options addcomment leaderboard offscreen job prevlink prevnext navigation reply-link hide hidden sidebox archives vcard}

      META_WORDS = %w{january february march april may june july august september october november december jan feb mar apr may jun jul aug sep oct nov dec st th rd nd comments written posted on at published 2000 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 updated last gmt est pst pdt edt cet cdt cst article feature featured filed under comment comments follow twitter facebook email e-mail register story continue continues reading read inside more page next related response responses respond contact street phone tel email e-mail fax info tags tagged tag thanks credit creative commons copy nbsp lt gt this friend printable version subscribe rss mail follow twitter article via leave}.uniq

      # FIX: the original list contained "h6" twice; the duplicate is gone
      # (harmless for membership tests, but noise).
      WONT_CONTAIN_FULL_CONTENT = %w{h1 h2 h3 h4 h5 h6 li ol ul br a img meta cite strong em i b input head small big code title sup sub dd dt}
      COULD_CONTAIN_FULL_CONTENT = %w{body div p table tr td article pre blockquote tbody section}

      ## Output sanitization element sets
      BLOCK_OUTPUT_ELEMENTS = %w{div p h2 h3 h4 h5 h6 li dl pre ul ol blockquote section article audio video cite dd dt figure caption br table tr td thead tbody tfoot}
      INLINE_OUTPUT_ELEMENTS = %w{a img b strong em i br code sup font small big dd dt}
      OUTPUT_ELEMENTS = BLOCK_OUTPUT_ELEMENTS + INLINE_OUTPUT_ELEMENTS
      NON_HEADER_ELEMENTS = %w{p br}

      # Create a document object based on the raw HTML content provided
      def initialize(raw_content, options = {})
        @options = options
        @raw_content = Pismo::Document.clean_html(raw_content)
        build_doc
      end

      # Normalizes and sanitizes @raw_content, parses it with Nokogiri into
      # @doc, does a structural pre-clean, then hands off to the subclass's
      # #analyze.
      def build_doc
        @content = {}

        if RUBY_VERSION > "1.9"
          # NOTE(review): Encoding != "UTF-8" compares an Encoding object to a
          # String, so it is always true and the encode! always runs. Fixing
          # the comparison would change behavior for already-UTF-8 strings,
          # so it is left as-is deliberately — confirm intent before changing.
          @raw_content.encode!("UTF-8", :invalid => :replace, :replace => '?') if @raw_content.encoding != "UTF-8"
          @raw_content.encode!("ASCII-8BIT", :invalid => :replace, :replace => '?') if !@raw_content.valid_encoding?
        end

        # Normalize whitespace (as much to make debugging sessions look nice as anything else)
        @raw_content.gsub!(/\s{2,}/, ' ')
        @raw_content.gsub!(/\r/, "\n")
        @raw_content.gsub!(/\n{3,}/, "\n\n")
        @raw_content.gsub!(/(\<br(\s\/)?\>){2,}/, "</p><p>")

        # Remove scripts manually, Sanitize and/or Nokogiri seem to go a bit funny with them
        @raw_content.gsub!(/\<script .*?\<\/script\>/im, '')

        # Get rid of "smart" quotes and other Unicode nonsense (byte-level,
        # hence the temporary ASCII-8BIT reinterpretation)
        @raw_content.force_encoding("ASCII-8BIT") if RUBY_VERSION > "1.9"
        @raw_content.gsub!("\xe2\x80\x89", " ")
        @raw_content.gsub!("\xe2\x80\x99", "'")
        @raw_content.gsub!("\xe2\x80\x98", "'")
        @raw_content.gsub!("\xe2\x80\x9c", '"')
        @raw_content.gsub!("\xe2\x80\x9d", '"')
        @raw_content.gsub!("\xe2\x80\xf6", '.')
        @raw_content.force_encoding("UTF-8") if RUBY_VERSION > "1.9"

        # Sanitize the HTML
        @raw_content = Sanitize.clean(@raw_content,
          :elements => OK_ELEMENTS,
          :attributes => OK_ATTRIBUTES,
          :remove_contents => true,
          :output_encoding => 'utf-8'
        )

        @doc = Nokogiri::HTML(@raw_content, nil, 'utf-8')

        # Do a pre clean up of elements.
        @doc.css("div, span, table, tr, td, pre").each do |el|
          # Any block elements with no child block elements can become paragraphs
          if (BLOCK_OUTPUT_ELEMENTS & el.inner_html.scan(/\<(\w+)/).flatten).empty?
            el.name = "p"
          elsif el.name != "span"
            el.name = "div"
          end

          # Any SPANs that aren't within paragraphs can become paragraphs too
          el.name = "p" if el.name == "span" && !el.path.scan(/[a-z]+/).include?('p')

          # Branches whose id/class contain a fatal word are removed outright
          el.remove if (FATAL_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.strip.scan(/[a-z]+/)).size > 0
        end

        analyze
      end

      # Return the content from best match number of index (default 0) and, optionally, clean it to plain-text.
      # Results are memoized per [clean, index] pair in @content.
      def content(clean = false, index = 0)
        return @content[[clean, index]] if @content[[clean, index]]
        return '' if !@content_candidates || @content_candidates.empty?

        content_branch = content_at(index)
        # NOTE(review): orphans_to_remove is never populated below, so the
        # removal pass near the end is currently a no-op — presumably a
        # half-finished "strip text before the title" feature.
        orphans_to_remove = []

        #ap content_branch.to_html
        #exit

        # Go through every piece of the content and rip out sections that contain too many tags compared to words
        # This is usually indicative of "widgets" or link bar sections
        content_branch.css('*').each_with_index do |el, i|
          next unless el

          if el.name == "h1"
            el.remove
            next
          end

          # A lone h2 is almost certainly the article title repeated — drop it
          if el.name == "h2" && content_branch.inner_html.scan('<h2').size == 1
            el.remove
          end

          # Remove elements that contain words but there are more tags than words overall
          # First, count the words
          #word_count = 0
          #el.traverse do |subel|
          #  if subel.text? && subel.path !~ /\/a\// && subel.path !~ /\/(h1|h2|h3|h4|h5|h6)\//
          #    word_count += (subel.text.downcase.scan(/[a-z]{4,}/) - META_WORDS).size
          #  end
          #end
          #
          ## .. then count the tags
          #
          #inner_tags = el.inner_html.scan(/\<\w.*?\>/).size
          #if word_count < inner_tags && inner_tags > 3 && word_count < 250
          #  puts "At #{el.name} #{el['id']} #{el['class']} containing '#{el.text[0..20]}' we have #{word_count} valid words to #{el.inner_html.scan(/\<\w.*?\>/).size} tags"
          #  #puts "Removing #{el.name} #{el['id']} #{el['class']} TOO MANY TAGS FOR WORDS"
          #  el.remove
          #  next
          #end

          # If there are at least 2 words and a third of them are "meta words," remove the element
          #inner_words = el.text.to_s.downcase.scan(/[a-z]{3,}/)
          #if BLOCK_OUTPUT_ELEMENTS.include?(el.name) && inner_words.size >= 2
          #  if ((inner_words & META_WORDS).size >= (inner_words.size / 3))
          #    el.remove
          #  end
          #end

          # Near-empty elements that aren't images (and contain none) are noise
          if el.text && el.text.strip.length < 3 && !%w{img}.include?(el.name) && el.inner_html !~ /\<img/
            el.remove
            next
          end

          # Paragraphs with no sentence-ending punctuation and no image are noise
          if el.name == "p" && el.text !~ /(\.|\?|\!|\"|\')(\s|$)/ && el.inner_html !~ /\<img/
            el.remove
            next
          end

          # If the ID or class of the element contains a fatally bad word, get rid of it
          if (BAD_WORDS & (el['id'].to_s + ' ' + el['class'].to_s).downcase.scan(/[a-z]+/)).length > 0
            #puts "Removing #{el.name} #{el['id']} #{el['class']} BAD"
            el.remove
            next
          end
        end

        # If a title was found early in the result document but had text before it, remove that text - it's probably crap
        orphans_to_remove.each { |el| el.remove }

        # Clean up the HTML again - Nokogiri outputs it with full doctype and crap
        clean_html = strip(Sanitize.clean(content_branch.to_html, :elements => (clean ? BLOCK_OUTPUT_ELEMENTS : OUTPUT_ELEMENTS), :attributes => (clean ? OK_CLEAN_ATTRIBUTES : OK_ATTRIBUTES)))

        # If the content is desired as "clean" (i.e. plain-text), do some quick fix-ups
        if clean
          # Get rid of line break tags, make list items look nice, remove all other HTML tags, and clean up spaces and newlines
          clean_html.gsub!(/<br.*?>/, "\n")
          clean_html.gsub!(/<li>/, '* ')
          clean_html.gsub!(/<\w+>/, '')
          clean_html.gsub!(/<\/\w+>/, "\n")
          clean_html.gsub!(/\ +/, ' ')
          clean_html.gsub!(/^\s+\n/, "\n")
          clean_html.gsub!(/\n{2,}/, "\n")
          clean_html.strip!
        end

        # If tags butt up against each other across lines, remove the line break(s)
        clean_html.gsub!(/\>\n+\</, '><')

        # Get rid of images whose sources are relative (TODO: Make this optional)
        clean_html.gsub!(/\<img .*?\>/i) do |img_tag|
          img_tag =~ /\Whttp/ ? img_tag : ''
        end

        # Remove empty tags
        clean_html.gsub!(/<(\w+)><\/\1>/, "")

        # Just a messy, hacky way to make output look nicer with subsequent paragraphs..
        clean_html.gsub!(/<\/(div|p|h1|h2|h3|h4|h5|h6)>/, '</\1>' + "\n\n")

        @content[[clean, index]] = clean_html
      end

      # Returns up to +qty+ tidied-up sentences extracted from the content.
      def sentences(qty = 3)
        clean_content = Sanitize.clean(content, :elements => NON_HEADER_ELEMENTS, :attributes => OK_CLEAN_ATTRIBUTES, :remove_contents => %w{h1 h2 h3 h4 h5 h6})

        fodder = ''
        doc = Nokogiri::HTML(clean_content, nil, 'utf-8')

        doc.traverse do |el|
          # Skip the first two path segments (document/html wrappers)
          path_segments = el.path.scan(/[a-z]+/)[2..-1]
          next unless path_segments && path_segments.length > 1
          if el.text? && el.text.strip.length < 3
            el.remove
            next
          end
          if el.text? && NON_HEADER_ELEMENTS.include?(path_segments[-2])
            text = el.text.strip
            text += "." if text !~ /[\.\!\?\"\']$/
            fodder += text + "\n"
          end
        end

        # Fall back to the plain-text content if traversal yielded too little
        fodder = content(true) if fodder.to_s.length < 50
        fodder.gsub!(/\b\w\W\s/, '')

        #sentences = fodder.scan(/([\&\w\s\-\'\,\+\.\/\\\:\#\(\)\=\"\?\!]+?[\.\?\!])(\s|\Z)/im).map { |s| s.first }
        sentences = fodder.scan(/(.+?[\.\?\!])(\s|\Z)/im).map { |s| s.first.strip }

        sentences.compact!
        sentences.map! { |s| s.strip }
        sentences.map! { |s| s.sub(/^[^\"\'a-z0-9\(\[]+/im, '') }
        sentences.map! { |s| s.sub(/[^a-z0-9\'\"\)\]\.\!\:\?]+$/im, '') }
        sentences.map! { |s| s.gsub(/\s+/m, ' ') }
        sentences.first(qty)
      end

      # Returns up to +qty+ image src attributes found in the content.
      def images(qty = 3)
        doc = Nokogiri::HTML(content, nil, 'utf-8')
        images = []
        doc.css("img").each do |img|
          images << img['src']
          break if images.length == qty
        end
        images
      end

      # Remove leading and trailing spaces on lines throughout a string (a bit like String#strip, but for multi-lines)
      def strip(s)
        s.gsub(/^\s+/, '').gsub(/\s+$/, '')
      end
    end
  end
end
|