busk-ruby-readability 1.0.6 → 1.0.7

Sign up to get free protection for your applications and to get access to all the features.
data/lib/readability.rb CHANGED
@@ -1,35 +1,25 @@
1
1
  require 'rubygems'
2
2
  require 'nokogiri'
3
3
 
4
+ # Minimal Object#try fallback so this also runs in non-Rails projects
5
+ class Object
6
+ def try(method)
7
+ send method if respond_to? method
8
+ end
9
+ end
10
+
4
11
  module Readability
5
12
  class Document
6
13
  TEXT_LENGTH_THRESHOLD = 25
7
14
  RETRY_LENGTH = 250
8
15
 
9
- attr_accessor :options, :html, :best_candidate
16
+ attr_accessor :document, :base_uri, :request, :options, :best_candidate
10
17
 
11
- def initialize(input, options = {})
12
- @input = input
18
+ def initialize(document, base_uri, request, options = {})
19
+ @document = document
20
+ @base_uri = base_uri
21
+ @request = request
13
22
  @options = options
14
- make_html
15
- end
16
-
17
- # def charset
18
- # @charset ||= begin
19
- # if content_type = @input.read.to_s.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)
20
- # if content_type = content_type[0].match(/charset=([\w-]*)/i)
21
- # content_type[1]
22
- # else
23
- # "utf-8"
24
- # end
25
- # else
26
- # "utf-8"
27
- # end
28
- # end
29
- # end
30
-
31
- def make_html
32
- @html = Nokogiri::HTML(@input, nil, 'UTF-8')
33
23
  end
34
24
 
35
25
  REGEXES = {
@@ -46,9 +36,15 @@ module Readability
46
36
  :videoRe => /http:\/\/(www\.)?(youtube|vimeo|ted|player\.vimeo)\.com/i
47
37
  }
48
38
 
39
+ # TODO: should we get rid of this? It only sets the document encoding and resets @best_candidate.
40
+ def make_html
41
+ @document.encoding = 'UTF-8'
42
+ @best_candidate = nil
43
+ end
44
+
49
45
  def content(remove_unlikely_candidates = true)
50
- @html.css("script, style").each {|el| el.remove }
51
- @html.search('//comment()').each {|el| el.remove }
46
+ @document.css("script, style").each {|el| el.remove }
47
+ @document.search('//comment()').each {|el| el.remove }
52
48
 
53
49
  article = youtube if is_youtube? && remove_unlikely_candidates
54
50
  article = vimeo if is_vimeo? && remove_unlikely_candidates
@@ -75,23 +71,23 @@ module Readability
75
71
  end
76
72
 
77
73
  def is_youtube?
78
- (@input.base_uri.to_s =~ /^http:\/\/(www\.)?youtube.com/)
74
+ (@base_uri.to_s =~ /^http:\/\/(www\.)?youtube.com/)
79
75
  end
80
76
 
81
77
  def is_vimeo?
82
- (@input.base_uri.to_s =~ /^http:\/\/(www.)?vimeo.com/)
78
+ (@base_uri.to_s =~ /^http:\/\/(www.)?vimeo.com/)
83
79
  end
84
80
 
85
81
  def is_ted?
86
- (@input.base_uri.to_s =~ /^http:\/\/(www.)?ted.com\/talks/)
82
+ (@base_uri.to_s =~ /^http:\/\/(www.)?ted.com\/talks/)
87
83
  end
88
84
 
89
85
  def is_special_case?
90
- (@input.base_uri.to_s =~ REGEXES[:videoRe])
86
+ (@base_uri.to_s =~ REGEXES[:videoRe])
91
87
  end
92
88
 
93
89
  def youtube
94
- if @input.base_uri.request_uri =~ /\?v=([_\-a-z0-9]+)&?/i
90
+ if @request =~ /\?v=([_\-a-z0-9]+)&?/i
95
91
  Nokogiri::HTML.fragment <<-HTML
96
92
  <object width="706" height="422">
97
93
  <param name="movie" value="http://www.youtube.com/v/#{$1}?fs=1&amp;hl=en_US"></param>
@@ -106,8 +102,11 @@ module Readability
106
102
  end
107
103
 
108
104
  def vimeo
105
+ # matches non-channel pages, or pages that used swfobject to print the player
106
+ if @document.css("#clip_id")
107
+ Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{@document.css("#clip_id").attr('value')}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
109
108
  # matches channel pages
110
- if (player = @html.css(".player")).present?
109
+ elsif player = @document.css(".player")
111
110
  html = ""
112
111
  player.each do |video|
113
112
  if video.to_html =~ /clip_id=([0-9]+)/
@@ -115,16 +114,13 @@ module Readability
115
114
  end
116
115
  end
117
116
  Nokogiri::HTML.fragment(html)
118
- # matches non-channel or pages that used swfobject to print player
119
- elsif @html.to_html =~ /clip_id=([0-9]+)/
120
- Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{$1}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
121
117
  else
122
118
  nil
123
119
  end
124
120
  end
125
121
 
126
122
  def ted
127
- if (player = @html.css(".copy_paste")).present?
123
+ if (player = @document.css(".copy_paste")).present?
128
124
  unless player.first.attr("value").blank?
129
125
  Nokogiri::HTML.fragment(player.first.attr("value").to_s)
130
126
  else
@@ -140,8 +136,8 @@ module Readability
140
136
  # Things like preambles, content split by ads that we removed, etc.
141
137
 
142
138
  sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
143
- output = Nokogiri::XML::Node.new('div', @html)
144
- best_candidate[:elem].parent.andand.children.each do |sibling|
139
+ output = Nokogiri::XML::Node.new('div', @document)
140
+ best_candidate[:elem].parent.try(:children).each do |sibling|
145
141
  append = false
146
142
  append = true if sibling == best_candidate[:elem]
147
143
  append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
@@ -173,10 +169,10 @@ module Readability
173
169
 
174
170
  debug("Top 5 candidates:")
175
171
  sorted_candidates[0...5].each do |candidate|
176
- debug("Candidate #{candidate[:elem].andand.name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
172
+ debug("Candidate #{candidate[:elem].try(:name)}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
177
173
  end
178
174
 
179
- best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
175
+ best_candidate = sorted_candidates.first || { :elem => @document.css("body").first, :content_score => 0 }
180
176
  #debug("Best candidate #{best_candidate[:elem].andand.name} with score #{best_candidate[:content_score]}")
181
177
  best_candidate
182
178
  end
@@ -190,7 +186,7 @@ module Readability
190
186
 
191
187
  def score_paragraphs(min_text_length)
192
188
  candidates = {}
193
- @html.css("p,td").each do |elem|
189
+ @document.css("p,td").each do |elem|
194
190
  parent_node = elem.parent
195
191
  grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
196
192
  inner_text = elem.text
@@ -263,7 +259,7 @@ module Readability
263
259
  end
264
260
 
265
261
  def remove_unlikely_candidates!
266
- @html.css("*").each do |elem|
262
+ @document.css("*").each do |elem|
267
263
  str = "#{elem[:class]}#{elem[:id]}"
268
264
  if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
269
265
  debug("Removing unlikely candidate - #{str}")
@@ -273,7 +269,7 @@ module Readability
273
269
  end
274
270
 
275
271
  def transform_misused_divs_into_paragraphs!
276
- @html.css("*").each do |elem|
272
+ @document.css("*").each do |elem|
277
273
  if elem.name.downcase == "div"
278
274
  # transform <div>s that do not contain other block elements into <p>s
279
275
  if elem.inner_html !~ REGEXES[:divToPElementsRe]