jruby-readability 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/lib/jruby-readability.rb +1 -0
  2. data/lib/readability.rb +475 -0
  3. metadata +130 -0
@@ -0,0 +1 @@
1
+ require 'readability'
@@ -0,0 +1,475 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+ require 'guess_html_encoding'
4
+
5
+ module Readability
6
+ class Document
7
# Baseline settings; #initialize merges caller-supplied options over these.
DEFAULT_OPTIONS = {
  # #content retries with looser heuristics when the article text is shorter than this.
  retry_length: 250,
  # Paragraphs shorter than this are ignored by #score_paragraphs.
  min_text_length: 25,
  # Toggles for the individual clean-up heuristics.
  remove_unlikely_candidates: true,
  weight_classes: true,
  clean_conditionally: true,
  remove_empty_nodes: true,
  # Thresholds and format blacklist consulted by #image_meets_criteria?.
  min_image_width: 130,
  min_image_height: 80,
  ignore_image_format: []
}.freeze
18
+
19
+ attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
20
+
21
# Builds a document from raw HTML and parses it immediately (see #make_html).
#
# input   - String of HTML markup.
# options - Hash merged over DEFAULT_OPTIONS. Keys read here: :encoding,
#           :html_headers, :do_not_guess_encoding, :remove_unlikely_candidates,
#           :weight_classes and :clean_conditionally.
def initialize(input, options = {})
  @options = DEFAULT_OPTIONS.merge(options)
  @input = input

  # Guess the encoding on every Ruby whose strings carry one. The previous
  # check (RUBY_VERSION =~ /^1\.9\./) silently skipped this on Ruby 2.0+,
  # leaving @options[:encoding] unset there.
  if @input.respond_to?(:encoding) && !@options[:encoding]
    @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
    @options[:encoding] = @input.encoding.to_s
  end

  # Runs of two or more <br> become paragraph breaks; <font> tags become <span>.
  @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
  @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
  @weight_classes = @options[:weight_classes]
  @clean_conditionally = @options[:clean_conditionally]
  @best_candidate_has_image = true
  make_html
end
37
+
38
# Prepares @html for extraction and ranks the possible content containers.
#
# Drops <script>/<style> nodes, optionally prunes unlikely candidates, turns
# block-free <div>s into <p>s, then stores the scored containers in
# @candidates and the winner in @best_candidate (which is also returned).
def prepare_candidates
  @html.css("script, style").each(&:remove)
  remove_unlikely_candidates! if @remove_unlikely_candidates
  transform_misused_divs_into_paragraphs!

  shortest_paragraph = options[:min_text_length]
  @candidates = score_paragraphs(shortest_paragraph)
  @best_candidate = select_best_candidate(@candidates)
end
46
+
47
# (Re)parses @input into @html using the configured encoding.
def make_html
  encoding = @options[:encoding]
  @html = Nokogiri::HTML(@input, nil, encoding)

  # A document without a body (empty string, redirect, ...) is replaced by
  # a minimal one so later lookups always have a <body> to work with.
  @html = Nokogiri::HTML('<body />', nil, encoding) if @html.css('body').empty?

  # HTML comments never carry article content.
  @html.xpath('//comment()').each(&:remove)
end
55
+
56
# Lists the URLs of images that satisfy #image_meets_criteria?.
#
# content - node to search; only honoured on the internal fallback pass
#           (see NOTE below).
# reload  - true when called recursively to search the whole document.
#
# Returns an Array of URL Strings. When the best candidate yields nothing,
# the whole document (@html) is searched once more.
# Raises RuntimeError when the optional fastimage gem is not installed.
def images(content=nil, reload=false)
  begin
    # Lazy require: fastimage is only needed for this feature.
    require 'fastimage'
  rescue LoadError
    raise "Please install fastimage in order to use the #images feature."
  end

  @best_candidate_has_image = false if reload

  prepare_candidates
  list_images = []
  tested_images = []
  # NOTE(review): a caller-supplied content is overwritten unless reload is
  # true. Kept as-is to preserve existing behaviour - confirm intent.
  content = @best_candidate[:elem] unless reload

  return list_images if content.nil?

  content.css("img").map(&:attributes).each do |element|
    next unless element["src"]

    url = element["src"].value

    # Check for duplicates first so a repeated URL never triggers a second
    # network probe for its dimensions.
    if tested_images.include?(url)
      debug("Image was tested: #{url}")
      next
    end

    height = element["height"].nil? ? 0 : element["height"].value.to_i
    width = element["width"].nil? ? 0 : element["width"].value.to_i

    if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?)
      image = get_image_size(url)
      next unless image
    else
      image = {:width => width, :height => height}
    end

    # Ignore any query string or fragment so "a.jpg?v=2" is reported as
    # "jpg" and can be matched against :ignore_image_format.
    image[:format] = File.extname(url.sub(/[?#].*\z/m, "")).delete(".")

    tested_images.push(url)
    if image_meets_criteria?(image)
      list_images << url
    else
      debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
    end
  end

  (list_images.empty? && content != @html) ? images(@html, true) : list_images
end
104
+
105
# Probes the dimensions of the image at url via FastImage.
#
# Returns a Hash with :width and :height, or nil when the size could not be
# determined; the failure is reported through #debug.
def get_image_size(url)
  width, height = FastImage.size(url)
  raise "Couldn't get size." if [width, height].any?(&:nil?)

  { :width => width, :height => height }
rescue => error
  debug("Image error: #{error}")
  nil
end
115
+
116
# Decides whether an image is worth reporting.
#
# image - Hash with :width, :height and :format.
#
# Returns false for formats listed in options[:ignore_image_format]
# (compared lower-cased); otherwise true when both dimensions reach the
# configured minimums (a missing minimum counts as 0).
def image_meets_criteria?(image)
  ignored_formats = options[:ignore_image_format]
  return false if ignored_formats.include?(image[:format].downcase)

  wide_enough = image[:width] >= (options[:min_image_width] || 0)
  wide_enough && image[:height] >= (options[:min_image_height] || 0)
end
120
+
121
# Patterns shared by the extraction heuristics, keyed by name.
REGEXES = {
    # class/id text that marks a node for removal in #remove_unlikely_candidates! ...
    :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
    # ... unless the same text also matches this pattern.
    :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
    # class/id text that adds to (positive) or subtracts from (negative) the weight in #class_weight.
    :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
    :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
    # A <div> whose inner HTML matches this is left as a <div> by #transform_misused_divs_into_paragraphs!.
    :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
    # Applied to the raw input in #initialize: runs of two or more <br> tags, and opening/closing <font> tags.
    :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
    :replaceFontsRe => /<(\/?)font[^>]*>/i,
    # The remaining patterns are not referenced by the code visible in this file.
    :trimRe => /^\s+|\s+$/,
    :normalizeRe => /\s{2,}/,
    :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
    :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
}
134
+
135
# Returns the text of the first <title> element, or nil when there is none.
def title
  title_node = @html.css("title").first
  return nil unless title_node

  title_node.text
end
139
+
140
+ # Look through the @html document looking for the author
141
+ # Precedence Information here on the wiki: (TODO attach wiki URL if it is accepted)
142
+ # Returns nil if no author is detected
143
def author
  # Highest precedence: the Dublin Core meta tag, e.g.
  # <meta name="dc.creator" content="Finch - http://www.getfinch.com" />
  @html.xpath('//meta[@name = "dc.creator"]').each do |meta|
    return meta['content'].strip if meta['content']
  end

  # Remaining sources, tried in order; the first element found wins.
  text_queries = [
    # <span class="byline author vcard"><span>By</span><cite class="fn">Austin Fonacier</cite></span>
    # <div class="author vcard"><a class="url fn" href="http://austinlivesinyoapp.com/">Austin Fonacier</a></div>
    '//*[contains(@class, "vcard")]//*[contains(@class, "fn")]',
    # <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>
    # TODO: strip out the (rel)?
    '//a[@rel = "author"]',
    # Any element with id="author".
    '//*[@id = "author"]'
  ]

  text_queries.each do |query|
    @html.xpath(query).each do |element|
      return element.text.strip if element.text
    end
  end

  nil
end
188
+
189
# Extracts the article as a sanitized HTML String.
#
# remove_unlikely_candidates - pass false to switch that heuristic off for
#                              this and later calls.
#
# When the extracted text is shorter than options[:retry_length], one
# heuristic is switched off (unlikely-candidate removal, then class
# weighting, then conditional cleaning), the document is re-parsed and the
# extraction is retried. Once nothing is left to relax, the short result is
# returned as-is.
def content(remove_unlikely_candidates = :default)
  @remove_unlikely_candidates = false if remove_unlikely_candidates == false

  prepare_candidates
  article = get_article(@candidates, @best_candidate)

  # sanitize runs before the length check, exactly as the check expects.
  cleaned_article = sanitize(article, @candidates, options)
  return cleaned_article unless article.text.strip.length < options[:retry_length]

  if @remove_unlikely_candidates
    @remove_unlikely_candidates = false
  elsif @weight_classes
    @weight_classes = false
  elsif @clean_conditionally
    @clean_conditionally = false
  else
    # nothing left to relax
    return cleaned_article
  end

  make_html
  content
end
214
+
215
# Assembles the article from the best candidate and its related siblings
# (preambles, content that was split by removed ads, and so on).
#
# candidates     - Hash of node => { :content_score, :elem }.
# best_candidate - the winning entry of that Hash.
#
# Returns a new <div> node holding copies of every selected sibling.
def get_article(candidates, best_candidate)
  top_elem = best_candidate[:elem]
  sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
  output = Nokogiri::XML::Node.new('div', @html)

  top_elem.parent.children.each do |sibling|
    scored = candidates[sibling]
    append = sibling == top_elem
    append = true if scored && scored[:content_score] >= sibling_score_threshold

    if sibling.name.downcase == "p"
      link_density = get_link_density(sibling)
      node_content = sibling.text
      node_length = node_content.length

      # Long paragraphs with few links, or short link-free ones that end a sentence.
      long_and_clean = node_length > 80 && link_density < 0.25
      short_sentence = node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
      append = true if long_and_clean || short_sentence
    end

    next unless append

    # Work on a copy so the document being processed is left untouched.
    copy = sibling.dup
    copy.name = "div" unless %w[div p].include?(sibling.name.downcase)
    output << copy
  end

  output
end
247
+
248
# Picks the highest-scoring candidate.
#
# candidates - Hash of node => { :content_score, :elem }.
#
# Returns the winning entry; when there are no candidates, a zero-score
# entry wrapping the document <body>. The ranking is reported via #debug.
def select_best_candidate(candidates)
  ranked = candidates.values.sort { |left, right| right[:content_score] <=> left[:content_score] }

  debug("Top 5 candidates:")
  ranked.first(5).each do |entry|
    node = entry[:elem]
    debug("Candidate #{node.name}##{node[:id]}.#{node[:class]} with score #{entry[:content_score]}")
  end

  winner = ranked.first
  winner ||= { :elem => @html.css("body").first, :content_score => 0 }

  node = winner[:elem]
  debug("Best candidate #{node.name}##{node[:id]}.#{node[:class]} with score #{winner[:content_score]}")

  winner
end
261
+
262
# Computes the share of an element's text that sits inside <a> tags.
#
# elem - node responding to #text and #css.
#
# Returns a Float: link text length divided by total text length.
# An element without any text yields 0.0. The plain division would give
# 0 / 0.0 = NaN there, which turns candidate scores into NaN and makes
# sorting them raise.
def get_link_density(elem)
  text_length = elem.text.length
  return 0.0 if text_length.zero?

  link_length = elem.css("a").map(&:text).join("").length
  link_length / text_length.to_f
end
267
+
268
# Scores the parents and grandparents of every sufficiently long <p>/<td>.
#
# min_text_length - paragraphs with less text than this are skipped.
#
# Each paragraph earns 1 point, plus one per comma-separated chunk, plus one
# per 100 characters (at most 3). The parent receives the full amount and
# the grandparent half. Totals are finally scaled by (1 - link density).
#
# Returns a Hash of node => { :content_score, :elem }.
def score_paragraphs(min_text_length)
  candidates = {}

  @html.css("p,td").each do |paragraph|
    parent = paragraph.parent
    grand_parent = parent.respond_to?(:parent) ? parent.parent : nil
    text = paragraph.text

    next if text.length < min_text_length

    candidates[parent] ||= score_node(parent)
    candidates[grand_parent] ||= score_node(grand_parent) if grand_parent

    points = 1 + text.split(',').length + [(text.length / 100).to_i, 3].min

    candidates[parent][:content_score] += points
    candidates[grand_parent][:content_score] += points / 2.0 if grand_parent
  end

  # Good content has a small link density (5% or less) and is barely
  # affected by this scaling.
  candidates.each do |node, entry|
    entry[:content_score] = entry[:content_score] * (1 - get_link_density(node))
  end

  candidates
end
297
+
298
# Weighs an element by its class and id attributes.
#
# e - node (or Hash-like) answering e[:class] and e[:id].
#
# Each non-empty attribute loses 25 points when it matches
# REGEXES[:negativeRe] and gains 25 when it matches REGEXES[:positiveRe].
# Always 0 when class weighting is switched off.
def class_weight(e)
  return 0 unless @weight_classes

  [e[:class], e[:id]].inject(0) do |weight, label|
    next weight unless label && label != ""

    weight -= 25 if label =~ REGEXES[:negativeRe]
    weight += 25 if label =~ REGEXES[:positiveRe]
    weight
  end
end
324
+
325
# Creates the initial candidate entry for a node.
#
# The score starts at the node's class weight and is adjusted by tag name:
# div +5, blockquote +3, form -3, th -5; other tags are left unchanged.
#
# Returns a Hash with :content_score and :elem.
def score_node(elem)
  tag_adjustment = { "div" => 5, "blockquote" => 3, "form" => -3, "th" => -5 }
  content_score = class_weight(elem)
  content_score += tag_adjustment.fetch(elem.name.downcase, 0)

  { :content_score => content_score, :elem => elem }
end
339
+
340
# Prints str to standard output, but only when options[:debug] is set.
def debug(str)
  return unless options[:debug]

  puts str
end
343
+
344
# Removes elements whose combined class and id text looks like page chrome.
#
# A node is removed when that text matches REGEXES[:unlikelyCandidatesRe]
# without also matching REGEXES[:okMaybeItsACandidateRe]. <html> and <body>
# are never removed.
def remove_unlikely_candidates!
  @html.css("*").each do |elem|
    label = "#{elem[:class]}#{elem[:id]}"
    next unless label =~ REGEXES[:unlikelyCandidatesRe]
    next if label =~ REGEXES[:okMaybeItsACandidateRe]

    tag = elem.name.downcase
    next if tag == 'html' || tag == 'body'

    debug("Removing unlikely candidate - #{label}")
    elem.remove
  end
end
353
+
354
# Renames every <div> that contains no block-level markup to <p>.
#
# A <div> counts as "misused" when its inner HTML does not match
# REGEXES[:divToPElementsRe]. Other elements are left alone; an earlier
# experiment that wrapped bare text nodes in <p> tags is not active.
def transform_misused_divs_into_paragraphs!
  @html.css("*").each do |elem|
    next unless elem.name.downcase == "div"
    next if elem.inner_html =~ REGEXES[:divToPElementsRe]

    debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
    elem.name = "p"
  end
end
373
+
374
# Cleans the assembled article in place and serialises it.
#
# node       - the article node produced by #get_article.
# candidates - scored candidates, handed on to #clean_conditionally.
# options    - accepted for compatibility; settings are read from @options.
#
# Returns the article HTML as a String with whitespace runs collapsed.
def sanitize(node, candidates, options = {})
  # Headers with a negative class weight or too many links go first.
  node.css("h1, h2, h3, h4, h5, h6").each do |header|
    header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
  end

  node.css("form, object, iframe, embed").each(&:remove)

  if @options[:remove_empty_nodes]
    # Drops <p> tags without text content - including those holding only images.
    node.css("p").each { |paragraph| paragraph.remove if paragraph.content.strip.empty? }
  end

  # Conditionally clean <table>s, <ul>s, and <div>s
  clean_conditionally(node, candidates, "table, ul, div")

  # Lookup tables (hashes rather than arrays, to keep membership tests cheap):
  # tags that survive, and block-ish tags whose text is padded with spaces
  # so that a<br>b keeps a gap between the two words.
  allowed = (@options[:tags] || %w[div p]).each_with_object({}) { |tag, table| table[tag] = true }
  padded = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center].each_with_object({}) do |tag, table|
    table[tag] = true
  end

  ([node] + node.css("*")).each do |el|
    tag = el.node_name

    if allowed[tag]
      # Whitelisted element: keep it, but strip attributes not explicitly allowed.
      el.attributes.each do |attribute, _value|
        el.delete(attribute) unless @options[:attributes] && @options[:attributes].include?(attribute.to_s)
      end
    else
      # Anything else is replaced by its text.
      replacement = padded[tag] ? " #{el.text} " : el.text
      el.swap(Nokogiri::XML::Text.new(replacement, el.document))
    end
  end

  # Get rid of duplicate whitespace
  node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ")
end
424
+
425
# Removes elements under node that look like navigation, ads or other
# boilerplate rather than content.
#
# node       - article node to clean (modified in place).
# candidates - Hash of node => { :content_score, :elem } from scoring.
# selector   - CSS selector choosing the elements to examine.
#
# An element is removed outright when its class weight plus content score
# is negative. Otherwise, if it has fewer than 10 commas, it is removed when
# any of the structural checks below fires. Does nothing when conditional
# cleaning is switched off.
def clean_conditionally(node, candidates, selector)
  return unless @clean_conditionally

  # Fall back to the documented default rather than the undefined constant
  # TEXT_LENGTH_THRESHOLD, which raised NameError whenever
  # options[:min_text_length] was nil.
  min_text_length = options[:min_text_length] || DEFAULT_OPTIONS[:min_text_length]

  node.css(selector).each do |el|
    weight = class_weight(el)
    content_score = candidates[el] ? candidates[el][:content_score] : 0
    name = el.name.downcase

    if weight + content_score < 0
      el.remove
      debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
    elsif el.text.count(",") < 10
      counts = %w[p img li a embed input].each_with_object({}) { |kind, tally| tally[kind] = el.css(kind).length }
      # Head start of 100 so list items only outweigh paragraphs in very long lists.
      counts["li"] -= 100

      content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
      link_density = get_link_density(el)
      to_remove = false
      reason = ""

      if counts["img"] > counts["p"]
        reason = "too many images"
        to_remove = true
      elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
        reason = "more <li>s than <p>s"
        to_remove = true
      elsif counts["input"] > (counts["p"] / 3).to_i
        reason = "less than 3x <p>s than <input>s"
        to_remove = true
      elsif content_length < min_text_length && (counts["img"] == 0 || counts["img"] > 2)
        reason = "too short a content length without a single image"
        to_remove = true
      elsif weight < 25 && link_density > 0.2
        reason = "too many links for its weight (#{weight})"
        to_remove = true
      elsif weight >= 25 && link_density > 0.5
        reason = "too many links for its weight (#{weight})"
        to_remove = true
      elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
        reason = "<embed>s with too short a content length, or too many <embed>s"
        to_remove = true
      end

      if to_remove
        debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
        el.remove
      end
    end
  end
end
474
+ end
475
+ end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jruby-readability
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.5.6
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Andrew Cantino
9
+ - starrhorne
10
+ - libc
11
+ - Kyle Maxwell
12
+ autorequire:
13
+ bindir: bin
14
+ cert_chain: []
15
+ date: 2012-12-14 00:00:00.000000000 Z
16
+ dependencies:
17
+ - !ruby/object:Gem::Dependency
18
+ name: rspec
19
+ requirement: !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ! '>='
23
+ - !ruby/object:Gem::Version
24
+ version: '2.8'
25
+ type: :development
26
+ prerelease: false
27
+ version_requirements: !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '2.8'
33
+ - !ruby/object:Gem::Dependency
34
+ name: rspec-expectations
35
+ requirement: !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '2.8'
41
+ type: :development
42
+ prerelease: false
43
+ version_requirements: !ruby/object:Gem::Requirement
44
+ none: false
45
+ requirements:
46
+ - - ! '>='
47
+ - !ruby/object:Gem::Version
48
+ version: '2.8'
49
+ - !ruby/object:Gem::Dependency
50
+ name: rr
51
+ requirement: !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ! '>='
55
+ - !ruby/object:Gem::Version
56
+ version: '1.0'
57
+ type: :development
58
+ prerelease: false
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ! '>='
63
+ - !ruby/object:Gem::Version
64
+ version: '1.0'
65
+ - !ruby/object:Gem::Dependency
66
+ name: nokogiri
67
+ requirement: !ruby/object:Gem::Requirement
68
+ none: false
69
+ requirements:
70
+ - - ! '>='
71
+ - !ruby/object:Gem::Version
72
+ version: 1.4.2
73
+ type: :runtime
74
+ prerelease: false
75
+ version_requirements: !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ! '>='
79
+ - !ruby/object:Gem::Version
80
+ version: 1.4.2
81
+ - !ruby/object:Gem::Dependency
82
+ name: guess_html_encoding
83
+ requirement: !ruby/object:Gem::Requirement
84
+ none: false
85
+ requirements:
86
+ - - ! '>='
87
+ - !ruby/object:Gem::Version
88
+ version: 0.0.4
89
+ type: :runtime
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ! '>='
95
+ - !ruby/object:Gem::Version
96
+ version: 0.0.4
97
+ description: Port of arc90's readability project to ruby
98
+ email:
99
+ - andrew@iterationlabs.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - lib/jruby-readability.rb
105
+ - lib/readability.rb
106
+ homepage: http://github.com/iterationlabs/ruby-readability
107
+ licenses: []
108
+ post_install_message:
109
+ rdoc_options: []
110
+ require_paths:
111
+ - lib
112
+ required_ruby_version: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ required_rubygems_version: !ruby/object:Gem::Requirement
119
+ none: false
120
+ requirements:
121
+ - - ! '>='
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ requirements: []
125
+ rubyforge_project: ruby-readability
126
+ rubygems_version: 1.8.24
127
+ signing_key:
128
+ specification_version: 3
129
+ summary: Port of arc90's readability project to ruby
130
+ test_files: []