busk-ruby-readability 1.0.6 → 1.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/readability.rb +37 -41
- data/spec/fixtures/cant_read.html +426 -0
- data/spec/fixtures/sample.html +1198 -0
- data/spec/fixtures/samples/channel4-1-fragments.rb +14 -0
- data/spec/fixtures/samples/channel4-1.html +1330 -0
- data/spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb +31 -0
- data/spec/fixtures/samples/globemail-ottawa-cuts.html +2410 -0
- data/spec/fixtures/should_not_truncate.txt +1077 -0
- data/spec/fixtures/vimeo.com.html +1072 -0
- data/spec/readability_spec.rb +199 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +13 -0
- metadata +17 -4
data/lib/readability.rb
CHANGED
@@ -1,35 +1,25 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'nokogiri'
|
3
3
|
|
4
|
+
# so to run with non-Rails projects
|
5
|
+
class Object
|
6
|
+
def try(method)
|
7
|
+
send method if respond_to? method
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
4
11
|
module Readability
|
5
12
|
class Document
|
6
13
|
TEXT_LENGTH_THRESHOLD = 25
|
7
14
|
RETRY_LENGTH = 250
|
8
15
|
|
9
|
-
attr_accessor :
|
16
|
+
attr_accessor :document, :base_uri, :request, :options, :best_candidate
|
10
17
|
|
11
|
-
def initialize(
|
12
|
-
@
|
18
|
+
def initialize(document, base_uri, request, options = {})
|
19
|
+
@document = document
|
20
|
+
@base_uri = base_uri
|
21
|
+
@request = request
|
13
22
|
@options = options
|
14
|
-
make_html
|
15
|
-
end
|
16
|
-
|
17
|
-
# def charset
|
18
|
-
# @charset ||= begin
|
19
|
-
# if content_type = @input.read.to_s.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)
|
20
|
-
# if content_type = content_type[0].match(/charset=([\w-]*)/i)
|
21
|
-
# content_type[1]
|
22
|
-
# else
|
23
|
-
# "utf-8"
|
24
|
-
# end
|
25
|
-
# else
|
26
|
-
# "utf-8"
|
27
|
-
# end
|
28
|
-
# end
|
29
|
-
# end
|
30
|
-
|
31
|
-
def make_html
|
32
|
-
@html = Nokogiri::HTML(@input, nil, 'UTF-8')
|
33
23
|
end
|
34
24
|
|
35
25
|
REGEXES = {
|
@@ -46,9 +36,15 @@ module Readability
|
|
46
36
|
:videoRe => /http:\/\/(www\.)?(youtube|vimeo|ted|player\.vimeo)\.com/i
|
47
37
|
}
|
48
38
|
|
39
|
+
# should we get rid of this?
|
40
|
+
def make_html
|
41
|
+
@document.encoding = 'UTF-8'
|
42
|
+
@best_candidate = nil
|
43
|
+
end
|
44
|
+
|
49
45
|
def content(remove_unlikely_candidates = true)
|
50
|
-
@
|
51
|
-
@
|
46
|
+
@document.css("script, style").each {|el| el.remove }
|
47
|
+
@document.search('//comment()').each {|el| el.remove }
|
52
48
|
|
53
49
|
article = youtube if is_youtube? && remove_unlikely_candidates
|
54
50
|
article = vimeo if is_vimeo? && remove_unlikely_candidates
|
@@ -75,23 +71,23 @@ module Readability
|
|
75
71
|
end
|
76
72
|
|
77
73
|
def is_youtube?
|
78
|
-
(@
|
74
|
+
(@base_uri.to_s =~ /^http:\/\/(www\.)?youtube.com/)
|
79
75
|
end
|
80
76
|
|
81
77
|
def is_vimeo?
|
82
|
-
(@
|
78
|
+
(@base_uri.to_s =~ /^http:\/\/(www.)?vimeo.com/)
|
83
79
|
end
|
84
80
|
|
85
81
|
def is_ted?
|
86
|
-
(@
|
82
|
+
(@base_uri.to_s =~ /^http:\/\/(www.)?ted.com\/talks/)
|
87
83
|
end
|
88
84
|
|
89
85
|
def is_special_case?
|
90
|
-
(@
|
86
|
+
(@base_uri.to_s =~ REGEXES[:videoRe])
|
91
87
|
end
|
92
88
|
|
93
89
|
def youtube
|
94
|
-
if @
|
90
|
+
if @request =~ /\?v=([_\-a-z0-9]+)&?/i
|
95
91
|
Nokogiri::HTML.fragment <<-HTML
|
96
92
|
<object width="706" height="422">
|
97
93
|
<param name="movie" value="http://www.youtube.com/v/#{$1}?fs=1&hl=en_US"></param>
|
@@ -106,8 +102,11 @@ module Readability
|
|
106
102
|
end
|
107
103
|
|
108
104
|
def vimeo
|
105
|
+
# matches non-channel or pages that used swfobject to print player
|
106
|
+
if @document.css("#clip_id")
|
107
|
+
Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{@document.css("#clip_id").attr('value')}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
|
109
108
|
# matches channel pages
|
110
|
-
|
109
|
+
elsif player = @document.css(".player")
|
111
110
|
html = ""
|
112
111
|
player.each do |video|
|
113
112
|
if video.to_html =~ /clip_id=([0-9]+)/
|
@@ -115,16 +114,13 @@ module Readability
|
|
115
114
|
end
|
116
115
|
end
|
117
116
|
Nokogiri::HTML.fragment(html)
|
118
|
-
# matches non-channel or pages that used swfobject to print player
|
119
|
-
elsif @html.to_html =~ /clip_id=([0-9]+)/
|
120
|
-
Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{$1}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
|
121
117
|
else
|
122
118
|
nil
|
123
119
|
end
|
124
120
|
end
|
125
121
|
|
126
122
|
def ted
|
127
|
-
if (player = @
|
123
|
+
if (player = @document.css(".copy_paste")).present?
|
128
124
|
unless player.first.attr("value").blank?
|
129
125
|
Nokogiri::HTML.fragment(player.first.attr("value").to_s)
|
130
126
|
else
|
@@ -140,8 +136,8 @@ module Readability
|
|
140
136
|
# Things like preambles, content split by ads that we removed, etc.
|
141
137
|
|
142
138
|
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
|
143
|
-
output = Nokogiri::XML::Node.new('div', @
|
144
|
-
best_candidate[:elem].parent.
|
139
|
+
output = Nokogiri::XML::Node.new('div', @document)
|
140
|
+
best_candidate[:elem].parent.try(:children).each do |sibling|
|
145
141
|
append = false
|
146
142
|
append = true if sibling == best_candidate[:elem]
|
147
143
|
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
|
@@ -173,10 +169,10 @@ module Readability
|
|
173
169
|
|
174
170
|
debug("Top 5 candidates:")
|
175
171
|
sorted_candidates[0...5].each do |candidate|
|
176
|
-
debug("Candidate #{candidate[:elem].
|
172
|
+
debug("Candidate #{candidate[:elem].try(:name)}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
|
177
173
|
end
|
178
174
|
|
179
|
-
best_candidate = sorted_candidates.first || { :elem => @
|
175
|
+
best_candidate = sorted_candidates.first || { :elem => @document.css("body").first, :content_score => 0 }
|
180
176
|
#debug("Best candidate #{best_candidate[:elem].andand.name} with score #{best_candidate[:content_score]}")
|
181
177
|
best_candidate
|
182
178
|
end
|
@@ -190,7 +186,7 @@ module Readability
|
|
190
186
|
|
191
187
|
def score_paragraphs(min_text_length)
|
192
188
|
candidates = {}
|
193
|
-
@
|
189
|
+
@document.css("p,td").each do |elem|
|
194
190
|
parent_node = elem.parent
|
195
191
|
grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
|
196
192
|
inner_text = elem.text
|
@@ -263,7 +259,7 @@ module Readability
|
|
263
259
|
end
|
264
260
|
|
265
261
|
def remove_unlikely_candidates!
|
266
|
-
@
|
262
|
+
@document.css("*").each do |elem|
|
267
263
|
str = "#{elem[:class]}#{elem[:id]}"
|
268
264
|
if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
|
269
265
|
debug("Removing unlikely candidate - #{str}")
|
@@ -273,7 +269,7 @@ module Readability
|
|
273
269
|
end
|
274
270
|
|
275
271
|
def transform_misused_divs_into_paragraphs!
|
276
|
-
@
|
272
|
+
@document.css("*").each do |elem|
|
277
273
|
if elem.name.downcase == "div"
|
278
274
|
# transform <div>s that do not contain other block elements into <p>s
|
279
275
|
if elem.inner_html !~ REGEXES[:divToPElementsRe]
|