busk-ruby-readability 1.0.6 → 1.0.7

This diff shows the differences between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions exactly as they appear in their respective public registries.
data/lib/readability.rb CHANGED
@@ -1,35 +1,25 @@
1
1
  require 'rubygems'
2
2
  require 'nokogiri'
3
3
 
4
+ # so to run with non-Rails projects
5
+ class Object
6
+ def try(method)
7
+ send method if respond_to? method
8
+ end
9
+ end
10
+
4
11
  module Readability
5
12
  class Document
6
13
  TEXT_LENGTH_THRESHOLD = 25
7
14
  RETRY_LENGTH = 250
8
15
 
9
- attr_accessor :options, :html, :best_candidate
16
+ attr_accessor :document, :base_uri, :request, :options, :best_candidate
10
17
 
11
- def initialize(input, options = {})
12
- @input = input
18
+ def initialize(document, base_uri, request, options = {})
19
+ @document = document
20
+ @base_uri = base_uri
21
+ @request = request
13
22
  @options = options
14
- make_html
15
- end
16
-
17
- # def charset
18
- # @charset ||= begin
19
- # if content_type = @input.read.to_s.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)
20
- # if content_type = content_type[0].match(/charset=([\w-]*)/i)
21
- # content_type[1]
22
- # else
23
- # "utf-8"
24
- # end
25
- # else
26
- # "utf-8"
27
- # end
28
- # end
29
- # end
30
-
31
- def make_html
32
- @html = Nokogiri::HTML(@input, nil, 'UTF-8')
33
23
  end
34
24
 
35
25
  REGEXES = {
@@ -46,9 +36,15 @@ module Readability
46
36
  :videoRe => /http:\/\/(www\.)?(youtube|vimeo|ted|player\.vimeo)\.com/i
47
37
  }
48
38
 
39
+ # should we get rid of this?
40
+ def make_html
41
+ @document.encoding = 'UTF-8'
42
+ @best_candidate = nil
43
+ end
44
+
49
45
  def content(remove_unlikely_candidates = true)
50
- @html.css("script, style").each {|el| el.remove }
51
- @html.search('//comment()').each {|el| el.remove }
46
+ @document.css("script, style").each {|el| el.remove }
47
+ @document.search('//comment()').each {|el| el.remove }
52
48
 
53
49
  article = youtube if is_youtube? && remove_unlikely_candidates
54
50
  article = vimeo if is_vimeo? && remove_unlikely_candidates
@@ -75,23 +71,23 @@ module Readability
75
71
  end
76
72
 
77
73
  def is_youtube?
78
- (@input.base_uri.to_s =~ /^http:\/\/(www\.)?youtube.com/)
74
+ (@base_uri.to_s =~ /^http:\/\/(www\.)?youtube.com/)
79
75
  end
80
76
 
81
77
  def is_vimeo?
82
- (@input.base_uri.to_s =~ /^http:\/\/(www.)?vimeo.com/)
78
+ (@base_uri.to_s =~ /^http:\/\/(www.)?vimeo.com/)
83
79
  end
84
80
 
85
81
  def is_ted?
86
- (@input.base_uri.to_s =~ /^http:\/\/(www.)?ted.com\/talks/)
82
+ (@base_uri.to_s =~ /^http:\/\/(www.)?ted.com\/talks/)
87
83
  end
88
84
 
89
85
  def is_special_case?
90
- (@input.base_uri.to_s =~ REGEXES[:videoRe])
86
+ (@base_uri.to_s =~ REGEXES[:videoRe])
91
87
  end
92
88
 
93
89
  def youtube
94
- if @input.base_uri.request_uri =~ /\?v=([_\-a-z0-9]+)&?/i
90
+ if @request =~ /\?v=([_\-a-z0-9]+)&?/i
95
91
  Nokogiri::HTML.fragment <<-HTML
96
92
  <object width="706" height="422">
97
93
  <param name="movie" value="http://www.youtube.com/v/#{$1}?fs=1&amp;hl=en_US"></param>
@@ -106,8 +102,11 @@ module Readability
106
102
  end
107
103
 
108
104
  def vimeo
105
+ # matches non-channel or pages that used swfobject to print player
106
+ if @document.css("#clip_id")
107
+ Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{@document.css("#clip_id").attr('value')}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
109
108
  # matches channel pages
110
- if (player = @html.css(".player")).present?
109
+ elsif player = @document.css(".player")
111
110
  html = ""
112
111
  player.each do |video|
113
112
  if video.to_html =~ /clip_id=([0-9]+)/
@@ -115,16 +114,13 @@ module Readability
115
114
  end
116
115
  end
117
116
  Nokogiri::HTML.fragment(html)
118
- # matches non-channel or pages that used swfobject to print player
119
- elsif @html.to_html =~ /clip_id=([0-9]+)/
120
- Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{$1}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
121
117
  else
122
118
  nil
123
119
  end
124
120
  end
125
121
 
126
122
  def ted
127
- if (player = @html.css(".copy_paste")).present?
123
+ if (player = @document.css(".copy_paste")).present?
128
124
  unless player.first.attr("value").blank?
129
125
  Nokogiri::HTML.fragment(player.first.attr("value").to_s)
130
126
  else
@@ -140,8 +136,8 @@ module Readability
140
136
  # Things like preambles, content split by ads that we removed, etc.
141
137
 
142
138
  sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
143
- output = Nokogiri::XML::Node.new('div', @html)
144
- best_candidate[:elem].parent.andand.children.each do |sibling|
139
+ output = Nokogiri::XML::Node.new('div', @document)
140
+ best_candidate[:elem].parent.try(:children).each do |sibling|
145
141
  append = false
146
142
  append = true if sibling == best_candidate[:elem]
147
143
  append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
@@ -173,10 +169,10 @@ module Readability
173
169
 
174
170
  debug("Top 5 candidates:")
175
171
  sorted_candidates[0...5].each do |candidate|
176
- debug("Candidate #{candidate[:elem].andand.name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
172
+ debug("Candidate #{candidate[:elem].try(:name)}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
177
173
  end
178
174
 
179
- best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
175
+ best_candidate = sorted_candidates.first || { :elem => @document.css("body").first, :content_score => 0 }
180
176
  #debug("Best candidate #{best_candidate[:elem].andand.name} with score #{best_candidate[:content_score]}")
181
177
  best_candidate
182
178
  end
@@ -190,7 +186,7 @@ module Readability
190
186
 
191
187
  def score_paragraphs(min_text_length)
192
188
  candidates = {}
193
- @html.css("p,td").each do |elem|
189
+ @document.css("p,td").each do |elem|
194
190
  parent_node = elem.parent
195
191
  grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
196
192
  inner_text = elem.text
@@ -263,7 +259,7 @@ module Readability
263
259
  end
264
260
 
265
261
  def remove_unlikely_candidates!
266
- @html.css("*").each do |elem|
262
+ @document.css("*").each do |elem|
267
263
  str = "#{elem[:class]}#{elem[:id]}"
268
264
  if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
269
265
  debug("Removing unlikely candidate - #{str}")
@@ -273,7 +269,7 @@ module Readability
273
269
  end
274
270
 
275
271
  def transform_misused_divs_into_paragraphs!
276
- @html.css("*").each do |elem|
272
+ @document.css("*").each do |elem|
277
273
  if elem.name.downcase == "div"
278
274
  # transform <div>s that do not contain other block elements into <p>s
279
275
  if elem.inner_html !~ REGEXES[:divToPElementsRe]