jruby-readability 0.5.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/lib/jruby-readability.rb +1 -0
  2. data/lib/readability.rb +475 -0
  3. metadata +130 -0
@@ -0,0 +1 @@
1
+ require 'readability'
@@ -0,0 +1,475 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+ require 'guess_html_encoding'
4
+
5
+ module Readability
6
+ class Document
7
# Baseline settings. Options handed to Document#initialize are merged on top
# of this hash, so every key here can be overridden per document.
DEFAULT_OPTIONS = {
  # #content retries with looser heuristics when the article text is shorter than this.
  retry_length: 250,
  # Paragraphs shorter than this are not scored.
  min_text_length: 25,
  # Initial state of the three heuristics that #content relaxes one by one.
  remove_unlikely_candidates: true,
  weight_classes: true,
  clean_conditionally: true,
  # When true, #sanitize drops <p> elements without text content.
  remove_empty_nodes: true,
  # Thresholds and format blacklist consulted by #image_meets_criteria?.
  min_image_width: 130,
  min_image_height: 80,
  ignore_image_format: []
}.freeze
18
+
19
+ attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
20
+
21
# Builds a document from raw HTML and parses it immediately.
#
# input   - String of HTML markup.
# options - Hash merged over DEFAULT_OPTIONS. Keys read here:
#           :encoding              - explicit encoding handed to the parser;
#                                    when set, no guessing takes place.
#           :html_headers          - passed through to GuessHtmlEncoding.encode.
#           :do_not_guess_encoding - skip GuessHtmlEncoding and trust the
#                                    input string's own encoding.
#           :remove_unlikely_candidates, :weight_classes, :clean_conditionally
#                                  - initial state of the heuristics.
def initialize(input, options = {})
  @options = DEFAULT_OPTIONS.merge(options)
  @input = input

  # Detect encoding support by capability instead of matching RUBY_VERSION
  # against 1.9 only: the old check silently disabled encoding detection on
  # every later Ruby release.
  if @input.respond_to?(:encoding) && !@options[:encoding]
    @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
    @options[:encoding] = @input.encoding.to_s
  end

  # Runs of <br> become paragraph breaks and <font> tags become <span>s
  # before the markup reaches the parser.
  @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
  @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
  @weight_classes = @options[:weight_classes]
  @clean_conditionally = @options[:clean_conditionally]
  @best_candidate_has_image = true
  make_html
end
37
+
38
# Prunes the parsed document and scores what is left.
#
# Removes <script> and <style> elements, optionally removes unlikely
# candidates, converts misused <div>s to <p>s, then stores the scored
# candidates in @candidates and the winner in @best_candidate.
#
# Returns the best candidate.
def prepare_candidates
  @html.css("script, style").each(&:remove)
  remove_unlikely_candidates! if @remove_unlikely_candidates
  transform_misused_divs_into_paragraphs!

  minimum_length = options[:min_text_length]
  @candidates = score_paragraphs(minimum_length)
  @best_candidate = select_best_candidate(@candidates)
end
46
+
47
# (Re)parses @input into @html using the configured encoding.
#
# A document without a <body> (e.g. an empty string or a redirect page) is
# replaced by a document holding only an empty body. HTML comment nodes are
# removed from the result.
def make_html
  encoding = @options[:encoding]

  @html = Nokogiri::HTML(@input, nil, encoding)
  if @html.css('body').length == 0
    @html = Nokogiri::HTML('<body />', nil, encoding)
  end

  # Remove html comment tags
  @html.xpath('//comment()').each(&:remove)
end
55
+
56
# Collects the URLs of images that satisfy #image_meets_criteria?.
#
# The best candidate element is searched first. If that yields nothing, the
# method calls itself once with the whole document and reload = true.
#
# content - Node to search. It is only honoured when reload is true;
#           otherwise the best candidate's element is used.
# reload  - true when falling back to the whole document.
#
# Returns an Array of URL Strings (possibly empty).
# Raises RuntimeError when the fastimage gem cannot be loaded.
def images(content = nil, reload = false)
  begin
    require 'fastimage'
  rescue LoadError
    raise "Please install fastimage in order to use the #images feature."
  end

  @best_candidate_has_image = false if reload

  prepare_candidates
  list_images = []
  tested_images = []
  content = @best_candidate[:elem] unless reload

  return list_images if content.nil?

  content.css("img").map(&:attributes).each do |element|
    next unless element["src"]

    url = element["src"].value

    # Check for repeats before measuring: the size lookup may hit the
    # network, and an already-judged URL never changes the result.
    if tested_images.include?(url)
      debug("Image was tested: #{url}")
      next
    end

    height = element["height"].nil? ? 0 : element["height"].value.to_i
    width = element["width"].nil? ? 0 : element["width"].value.to_i

    # Only absolute http(s) URLs with a missing dimension are measured remotely.
    if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?)
      image = get_image_size(url)
      next unless image
    else
      image = { :width => width, :height => height }
    end

    image[:format] = File.extname(url).gsub(".", "")

    tested_images.push(url)
    if image_meets_criteria?(image)
      list_images << url
    else
      debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
    end
  end

  (list_images.empty? && content != @html) ? images(@html, true) : list_images
end
104
+
105
# Asks FastImage for the dimensions of the image at +url+.
#
# Returns a Hash with :width and :height, or nil when the size could not be
# determined. Any StandardError is reported through #debug and swallowed.
def get_image_size(url)
  width, height = FastImage.size(url)
  raise "Couldn't get size." if width.nil? || height.nil?

  { :width => width, :height => height }
rescue => error
  debug("Image error: #{error}")
  nil
end
115
+
116
# Decides whether an image is big enough and of an acceptable format.
#
# image - Hash with :width and :height (Integers) and an optional :format
#         String (a file extension without the dot).
#
# Returns false when the format is listed in options[:ignore_image_format],
# otherwise true only when both dimensions reach the configured minimums.
# A missing minimum counts as 0.
def image_meets_criteria?(image)
  # Tolerate a missing :format and a nil blacklist instead of raising NoMethodError.
  ignored_formats = Array(options[:ignore_image_format])
  return false if ignored_formats.include?(image[:format].to_s.downcase)

  image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0)
end
120
+
121
# Patterns shared by the extraction heuristics. The hash is frozen so the
# shared table cannot be modified by accident at runtime.
REGEXES = {
  # Class/id fragments of elements removed by remove_unlikely_candidates!,
  # unless okMaybeItsACandidateRe also matches.
  :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
  :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
  # Class/id fragments that raise or lower an element's weight in class_weight.
  :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
  :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
  # A <div> whose inner HTML matches this is kept as a <div> rather than turned into a <p>.
  :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
  # Applied to the raw input before parsing.
  :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
  :replaceFontsRe => /<(\/?)font[^>]*>/i,
  :trimRe => /^\s+|\s+$/,
  :normalizeRe => /\s{2,}/,
  :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
  # Accepts both http and https video URLs.
  :videoRe => /https?:\/\/(www\.)?(youtube|vimeo)\.com/i
}.freeze
134
+
135
# Returns the text of the first <title> element, or nil when the document
# has none.
def title
  node = @html.css("title").first
  return nil unless node

  node.text
end
139
+
140
# Looks through the @html document for the author's name.
#
# Sources are tried in this order and the first non-blank value wins:
#   1. <meta name="dc.creator" content="Finch - http://www.getfinch.com" />
#   2. an element whose class contains "fn" inside one whose class contains
#      "vcard", e.g.
#      <span class="byline author vcard"><span>By</span><cite class="fn">Austin Fonacier</cite></span>
#   3. <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>
#      (TODO: strip out the "(rel)"?)
#   4. the element with id="author"
#
# Blank matches are skipped so that an empty element does not hide a usable
# value further down the list.
#
# Returns the stripped author String, or nil if no author is detected.
def author
  @html.xpath('//meta[@name = "dc.creator"]').each do |element|
    name = element['content'].to_s.strip
    return name unless name.empty?
  end

  text_queries = [
    '//*[contains(@class, "vcard")]//*[contains(@class, "fn")]',
    '//a[@rel = "author"]',
    '//*[@id = "author"]'
  ]

  text_queries.each do |query|
    @html.xpath(query).each do |element|
      name = element.text.to_s.strip
      return name unless name.empty?
    end
  end

  nil
end
188
+
189
# Extracts the main article and returns it as sanitized HTML.
#
# remove_unlikely_candidates - pass false to disable the "unlikely
#                              candidates" pruning for this run and all
#                              following ones; any other value leaves the
#                              current setting alone.
#
# When the extracted text is shorter than options[:retry_length], one more
# heuristic is switched off (unlikely-candidate removal, then class
# weighting, then conditional cleaning), the document is re-parsed and the
# extraction is repeated. Once nothing is left to relax, the short result is
# returned as is.
def content(remove_unlikely_candidates = :default)
  @remove_unlikely_candidates = false if remove_unlikely_candidates == false

  prepare_candidates
  article = get_article(@candidates, @best_candidate)

  # sanitize runs before the length check; the length below is read afterwards.
  cleaned_article = sanitize(article, @candidates, options)
  return cleaned_article unless article.text.strip.length < options[:retry_length]

  if @remove_unlikely_candidates
    @remove_unlikely_candidates = false
  elsif @weight_classes
    @weight_classes = false
  elsif @clean_conditionally
    @clean_conditionally = false
  else
    # nothing we can do
    return cleaned_article
  end

  make_html
  content
end
214
+
215
# Assembles the article from the best candidate and its related siblings
# (preambles, content that was split by removed ads, and so on).
#
# candidates     - Hash mapping nodes to their score records.
# best_candidate - score record of the winning node.
#
# A sibling is included when it is the best candidate itself, when its score
# reaches max(10, 20% of the best score), or when it is a <p> that is either
# longer than 80 characters with link density below 0.25, or shorter than 80
# characters, free of links and containing a full stop followed by a space
# or the end of a line.
#
# Returns a new <div> node holding copies of the selected siblings.
def get_article(candidates, best_candidate)
  top_node = best_candidate[:elem]
  sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
  output = Nokogiri::XML::Node.new('div', @html)

  top_node.parent.children.each do |sibling|
    scored = candidates[sibling]
    append = (sibling == top_node) || (scored && scored[:content_score] >= sibling_score_threshold)

    if sibling.name.downcase == "p"
      link_density = get_link_density(sibling)
      node_content = sibling.text
      node_length = node_content.length

      long_prose = node_length > 80 && link_density < 0.25
      short_sentence = node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
      append = true if long_prose || short_sentence
    end

    next unless append

    # Work on a copy; otherwise the document being processed would change.
    copy = sibling.dup
    copy.name = "div" unless %w[div p].include?(sibling.name.downcase)
    output << copy
  end

  output
end
247
+
248
# Picks the highest-scoring candidate.
#
# candidates - Hash mapping nodes to records with :elem and :content_score.
#
# The five best entries and the winner are reported through #debug. When
# there are no candidates at all, the document's <body> is returned with a
# score of 0.
#
# Returns the winning record.
def select_best_candidate(candidates)
  ranked = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

  debug("Top 5 candidates:")
  ranked.first(5).each do |entry|
    node = entry[:elem]
    debug("Candidate #{node.name}##{node[:id]}.#{node[:class]} with score #{entry[:content_score]}")
  end

  winner = ranked.first
  winner ||= { :elem => @html.css("body").first, :content_score => 0 }

  node = winner[:elem]
  debug("Best candidate #{node.name}##{node[:id]}.#{node[:class]} with score #{winner[:content_score]}")

  winner
end
261
+
262
# Fraction of an element's text that sits inside <a> elements.
#
# elem - node responding to #text and #css.
#
# Returns a Float: the combined length of all link texts divided by the
# length of the element's text. An element without any text has a density
# of 0.0; the plain division would produce NaN, which then breaks every
# comparison and sort that uses the score.
def get_link_density(elem)
  text_length = elem.text.length
  return 0.0 if text_length.zero?

  link_length = elem.css("a").map(&:text).join("").length
  link_length / text_length.to_f
end
267
+
268
# Scores the parents and grandparents of every sufficiently long <p>/<td>.
#
# min_text_length - elements whose text is shorter than this are ignored.
#
# Each qualifying element earns 1 point, plus one per comma-separated chunk
# of its text, plus one per 100 characters (at most 3). The parent receives
# the full amount and the grandparent half of it, on top of the base score
# from #score_node. Finally every score is scaled by (1 - link density), so
# link-heavy containers lose most of their score.
#
# Returns a Hash mapping nodes to { :content_score, :elem } records.
def score_paragraphs(min_text_length)
  candidates = {}

  @html.css("p,td").each do |elem|
    inner_text = elem.text
    next if inner_text.length < min_text_length

    parent_node = elem.parent
    grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil

    candidates[parent_node] ||= score_node(parent_node)
    candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node

    points = 1 + inner_text.split(',').length + [inner_text.length / 100, 3].min

    candidates[parent_node][:content_score] += points
    candidates[grand_parent_node][:content_score] += points / 2.0 if grand_parent_node
  end

  # Good content has a small link density and is mostly unaffected by this scaling.
  candidates.each_pair do |node, record|
    record[:content_score] *= (1 - get_link_density(node))
  end

  candidates
end
297
+
298
# Weighs an element by its class and id attributes.
#
# e - object answering e[:class] and e[:id].
#
# Each non-empty attribute loses 25 points when it matches
# REGEXES[:negativeRe] and gains 25 when it matches REGEXES[:positiveRe];
# both can apply to the same attribute. Always 0 when class weighting is
# switched off.
#
# Returns the Integer weight.
def class_weight(e)
  return 0 unless @weight_classes

  [e[:class], e[:id]].inject(0) do |weight, attribute|
    next weight unless attribute && attribute != ""

    weight -= 25 if attribute =~ REGEXES[:negativeRe]
    weight += 25 if attribute =~ REGEXES[:positiveRe]
    weight
  end
end
324
+
325
# Builds the initial score record for a candidate node.
#
# The score starts at the node's #class_weight and is adjusted by tag name:
# div +5, blockquote +3, form -3, th -5; other tags are left unchanged.
#
# Returns a Hash with :content_score and :elem.
def score_node(elem)
  base = class_weight(elem)
  adjustment = { "div" => 5, "blockquote" => 3, "form" => -3, "th" => -5 }.fetch(elem.name.downcase, 0)

  { :content_score => base + adjustment, :elem => elem }
end
339
+
340
# Prints +str+ to standard output when options[:debug] is set; does nothing
# otherwise. Returns nil.
def debug(str)
  return unless options[:debug]

  puts str
end
343
+
344
# Removes elements whose class and id suggest they are not article content.
#
# An element is removed when its concatenated class and id match
# REGEXES[:unlikelyCandidatesRe] and do not match
# REGEXES[:okMaybeItsACandidateRe]. <html> and <body> are never removed.
def remove_unlikely_candidates!
  protected_tags = %w[html body]

  @html.css("*").each do |elem|
    str = "#{elem[:class]}#{elem[:id]}"

    next unless str =~ REGEXES[:unlikelyCandidatesRe]
    next if str =~ REGEXES[:okMaybeItsACandidateRe]
    next if protected_tags.include?(elem.name.downcase)

    debug("Removing unlikely candidate - #{str}")
    elem.remove
  end
end
353
+
354
# Renames every <div> that contains no block-level markup to <p>.
#
# A <div> counts as "misused" when its inner HTML does not match
# REGEXES[:divToPElementsRe]. Elements other than <div> are left untouched
# (wrapping bare text nodes in <p> tags is not performed).
def transform_misused_divs_into_paragraphs!
  @html.css("*").each do |elem|
    next unless elem.name.downcase == "div"
    next if elem.inner_html =~ REGEXES[:divToPElementsRe]

    debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
    elem.name = "p"
  end
end
373
+
374
# Reduces an extracted article node to whitelisted markup and serialises it.
#
# node       - the article node; it is modified in place.
# candidates - score records, forwarded to #clean_conditionally.
# options    - accepted but not read here; settings come from @options
#              (:remove_empty_nodes, :tags, :attributes).
#
# Steps, in order:
#   1. remove headings with a negative class weight or link density above 0.33
#   2. remove <form>, <object>, <iframe> and <embed>
#   3. optionally remove <p> elements without text (this also removes
#      paragraphs that contain only images)
#   4. conditionally clean <table>, <ul> and <div>
#   5. strip attributes from whitelisted tags (except those named in
#      @options[:attributes]) and replace every other element with its text
#
# Returns the HTML String with runs of whitespace collapsed.
def sanitize(node, candidates, options = {})
  node.css("h1, h2, h3, h4, h5, h6").each do |header|
    unwanted = class_weight(header) < 0 || get_link_density(header) > 0.33
    header.remove if unwanted
  end

  node.css("form, object, iframe, embed").each(&:remove)

  if @options[:remove_empty_nodes]
    node.css("p").each { |para| para.remove if para.content.strip.empty? }
  end

  clean_conditionally(node, candidates, "table, ul, div")

  # Lookup tables keyed by tag name, to avoid repeated include? calls.
  keep = {}
  (@options[:tags] || %w[div p]).each { |tag| keep[tag] = true }

  # These elements are replaced by their text padded with spaces, so that
  # a<br>b still has a space between "a" and "b".
  padded = {}
  %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center].each { |tag| padded[tag] = true }

  allowed_attributes = @options[:attributes]

  ([node] + node.css("*")).each do |el|
    tag = el.node_name

    if keep[tag]
      el.attributes.each_key do |attr_name|
        el.delete(attr_name) unless allowed_attributes && allowed_attributes.include?(attr_name.to_s)
      end
    else
      pad = padded[tag] ? ' ' : ''
      el.swap(Nokogiri::XML::Text.new("#{pad}#{el.text}#{pad}", el.document))
    end
  end

  # Get rid of duplicate whitespace
  node.to_html.gsub(/[\r\n\f]+/, "\n").gsub(/[\t ]+/, " ")
end
424
+
425
# Removes elements under +node+ that look like boilerplate rather than content.
#
# node       - node to search.
# candidates - Hash of score records; nodes without a record score 0.
# selector   - CSS selector for the elements to inspect.
#
# Does nothing when conditional cleaning is switched off. An element is
# removed when its class weight plus content score is negative. Otherwise,
# if its text has fewer than 10 commas, it is removed when any of these
# holds: more images than paragraphs; more list items (minus 100) than
# paragraphs, unless it is itself a list; more inputs than a third of the
# paragraphs; text shorter than the minimum length with either no image or
# more than two; too many links for its weight; or embeds with too little
# text.
def clean_conditionally(node, candidates, selector)
  return unless @clean_conditionally

  # The fallback used to be TEXT_LENGTH_THRESHOLD, a constant that is not
  # defined anywhere in this file, so a nil :min_text_length raised NameError.
  min_text_length = options[:min_text_length] || DEFAULT_OPTIONS[:min_text_length]

  node.css(selector).each do |el|
    weight = class_weight(el)
    content_score = candidates[el] ? candidates[el][:content_score] : 0
    name = el.name.downcase

    if weight + content_score < 0
      el.remove
      debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
    elsif el.text.count(",") < 10
      counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
      counts["li"] -= 100

      content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
      link_density = get_link_density(el)
      to_remove = false
      reason = ""

      if counts["img"] > counts["p"]
        reason = "too many images"
        to_remove = true
      elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
        reason = "more <li>s than <p>s"
        to_remove = true
      elsif counts["input"] > (counts["p"] / 3).to_i
        reason = "less than 3x <p>s than <input>s"
        to_remove = true
      elsif content_length < min_text_length && (counts["img"] == 0 || counts["img"] > 2)
        reason = "too short a content length without a single image"
        to_remove = true
      elsif weight < 25 && link_density > 0.2
        reason = "too many links for its weight (#{weight})"
        to_remove = true
      elsif weight >= 25 && link_density > 0.5
        reason = "too many links for its weight (#{weight})"
        to_remove = true
      elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
        reason = "<embed>s with too short a content length, or too many <embed>s"
        to_remove = true
      end

      if to_remove
        debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
        el.remove
      end
    end
  end
end
474
+ end
475
+ end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jruby-readability
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.5.6
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Andrew Cantino
9
+ - starrhorne
10
+ - libc
11
+ - Kyle Maxwell
12
+ autorequire:
13
+ bindir: bin
14
+ cert_chain: []
15
+ date: 2012-12-14 00:00:00.000000000 Z
16
+ dependencies:
17
+ - !ruby/object:Gem::Dependency
18
+ name: rspec
19
+ requirement: !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ! '>='
23
+ - !ruby/object:Gem::Version
24
+ version: '2.8'
25
+ type: :development
26
+ prerelease: false
27
+ version_requirements: !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '2.8'
33
+ - !ruby/object:Gem::Dependency
34
+ name: rspec-expectations
35
+ requirement: !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '2.8'
41
+ type: :development
42
+ prerelease: false
43
+ version_requirements: !ruby/object:Gem::Requirement
44
+ none: false
45
+ requirements:
46
+ - - ! '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '2.8'
49
+ - !ruby/object:Gem::Dependency
50
+ name: rr
51
+ requirement: !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ! '>='
55
+ - !ruby/object:Gem::Version
56
+ version: '1.0'
57
+ type: :development
58
+ prerelease: false
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ! '>='
63
+ - !ruby/object:Gem::Version
64
+ version: '1.0'
65
+ - !ruby/object:Gem::Dependency
66
+ name: nokogiri
67
+ requirement: !ruby/object:Gem::Requirement
68
+ none: false
69
+ requirements:
70
+ - - ! '>='
71
+ - !ruby/object:Gem::Version
72
+ version: 1.4.2
73
+ type: :runtime
74
+ prerelease: false
75
+ version_requirements: !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ! '>='
79
+ - !ruby/object:Gem::Version
80
+ version: 1.4.2
81
+ - !ruby/object:Gem::Dependency
82
+ name: guess_html_encoding
83
+ requirement: !ruby/object:Gem::Requirement
84
+ none: false
85
+ requirements:
86
+ - - ! '>='
87
+ - !ruby/object:Gem::Version
88
+ version: 0.0.4
89
+ type: :runtime
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ! '>='
95
+ - !ruby/object:Gem::Version
96
+ version: 0.0.4
97
+ description: Port of arc90's readability project to ruby
98
+ email:
99
+ - andrew@iterationlabs.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - lib/jruby-readability.rb
105
+ - lib/readability.rb
106
+ homepage: http://github.com/iterationlabs/ruby-readability
107
+ licenses: []
108
+ post_install_message:
109
+ rdoc_options: []
110
+ require_paths:
111
+ - lib
112
+ required_ruby_version: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ required_rubygems_version: !ruby/object:Gem::Requirement
119
+ none: false
120
+ requirements:
121
+ - - ! '>='
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ requirements: []
125
+ rubyforge_project: ruby-readability
126
+ rubygems_version: 1.8.24
127
+ signing_key:
128
+ specification_version: 3
129
+ summary: Port of arc90's readability project to ruby
130
+ test_files: []