busk-ruby-readability 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/readability.rb +37 -41
- data/spec/fixtures/cant_read.html +426 -0
- data/spec/fixtures/sample.html +1198 -0
- data/spec/fixtures/samples/channel4-1-fragments.rb +14 -0
- data/spec/fixtures/samples/channel4-1.html +1330 -0
- data/spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb +31 -0
- data/spec/fixtures/samples/globemail-ottawa-cuts.html +2410 -0
- data/spec/fixtures/should_not_truncate.txt +1077 -0
- data/spec/fixtures/vimeo.com.html +1072 -0
- data/spec/readability_spec.rb +199 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +13 -0
- metadata +17 -4
data/lib/readability.rb
CHANGED
@@ -1,35 +1,25 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'nokogiri'
|
3
3
|
|
4
|
+
# Fallback Object#try so the library also runs in non-Rails projects
|
5
|
+
# Fallback +try+ for projects that do not load ActiveSupport.
#
# Calls +method+ on the receiver when the receiver responds to it and returns
# the result; returns nil when it does not. Extra arguments and a block are
# forwarded to the called method.
#
# The definition is skipped when Object already has a +try+ (for example the
# one ActiveSupport provides), so an existing implementation that accepts
# arguments is never replaced by this minimal one.
class Object
  unless method_defined?(:try)
    def try(method, *args, &block)
      send(method, *args, &block) if respond_to?(method)
    end
  end
end
|
10
|
+
|
4
11
|
module Readability
|
5
12
|
class Document
|
6
13
|
TEXT_LENGTH_THRESHOLD = 25
|
7
14
|
RETRY_LENGTH = 250
|
8
15
|
|
9
|
-
attr_accessor :
|
16
|
+
attr_accessor :document, :base_uri, :request, :options, :best_candidate
|
10
17
|
|
11
|
-
def initialize(
|
12
|
-
@
|
18
|
+
def initialize(document, base_uri, request, options = {})
|
19
|
+
@document = document
|
20
|
+
@base_uri = base_uri
|
21
|
+
@request = request
|
13
22
|
@options = options
|
14
|
-
make_html
|
15
|
-
end
|
16
|
-
|
17
|
-
# def charset
|
18
|
-
# @charset ||= begin
|
19
|
-
# if content_type = @input.read.to_s.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)
|
20
|
-
# if content_type = content_type[0].match(/charset=([\w-]*)/i)
|
21
|
-
# content_type[1]
|
22
|
-
# else
|
23
|
-
# "utf-8"
|
24
|
-
# end
|
25
|
-
# else
|
26
|
-
# "utf-8"
|
27
|
-
# end
|
28
|
-
# end
|
29
|
-
# end
|
30
|
-
|
31
|
-
def make_html
|
32
|
-
@html = Nokogiri::HTML(@input, nil, 'UTF-8')
|
33
23
|
end
|
34
24
|
|
35
25
|
REGEXES = {
|
@@ -46,9 +36,15 @@ module Readability
|
|
46
36
|
:videoRe => /http:\/\/(www\.)?(youtube|vimeo|ted|player\.vimeo)\.com/i
|
47
37
|
}
|
48
38
|
|
39
|
+
# TODO: initialize no longer calls make_html -- should we get rid of this?
|
40
|
+
# Sets the wrapped document's encoding to UTF-8 and clears any stored
# best candidate. Returns nil.
def make_html
  doc = @document
  doc.encoding = 'UTF-8'
  @best_candidate = nil
end
|
44
|
+
|
49
45
|
def content(remove_unlikely_candidates = true)
|
50
|
-
@
|
51
|
-
@
|
46
|
+
@document.css("script, style").each {|el| el.remove }
|
47
|
+
@document.search('//comment()').each {|el| el.remove }
|
52
48
|
|
53
49
|
article = youtube if is_youtube? && remove_unlikely_candidates
|
54
50
|
article = vimeo if is_vimeo? && remove_unlikely_candidates
|
@@ -75,23 +71,23 @@ module Readability
|
|
75
71
|
end
|
76
72
|
|
77
73
|
# Returns the match position (0) when @base_uri, converted to a string, is an
# http:// YouTube URL with an optional "www." prefix; returns nil otherwise.
def is_youtube?
  # \A anchors at the very start of the string (^ would also match after a
  # newline) and the dot is escaped so only the literal host "youtube.com"
  # is accepted.
  @base_uri.to_s =~ /\Ahttp:\/\/(www\.)?youtube\.com/
end
|
80
76
|
|
81
77
|
# Returns the match position (0) when @base_uri, converted to a string, is an
# http:// Vimeo URL with an optional "www." prefix; returns nil otherwise.
def is_vimeo?
  # Dots are escaped so "www." and "vimeo.com" match literally, and \A anchors
  # at the very start of the string rather than at any line start.
  @base_uri.to_s =~ /\Ahttp:\/\/(www\.)?vimeo\.com/
end
|
84
80
|
|
85
81
|
# Returns the match position (0) when @base_uri, converted to a string, is an
# http:// URL under ted.com/talks with an optional "www." prefix; returns nil
# otherwise.
def is_ted?
  # Dots are escaped so "www." and "ted.com" match literally, and \A anchors
  # at the very start of the string rather than at any line start.
  @base_uri.to_s =~ /\Ahttp:\/\/(www\.)?ted\.com\/talks/
end
|
88
84
|
|
89
85
|
# Tests @base_uri, converted to a string, against REGEXES[:videoRe].
# Returns the index of the match, or nil when there is none.
def is_special_case?
  uri = @base_uri.to_s
  uri =~ REGEXES[:videoRe]
end
|
92
88
|
|
93
89
|
def youtube
|
94
|
-
if @
|
90
|
+
if @request =~ /\?v=([_\-a-z0-9]+)&?/i
|
95
91
|
Nokogiri::HTML.fragment <<-HTML
|
96
92
|
<object width="706" height="422">
|
97
93
|
<param name="movie" value="http://www.youtube.com/v/#{$1}?fs=1&hl=en_US"></param>
|
@@ -106,8 +102,11 @@ module Readability
|
|
106
102
|
end
|
107
103
|
|
108
104
|
def vimeo
|
105
|
+
# matches non-channel or pages that used swfobject to print player
|
106
|
+
if @document.css("#clip_id")
|
107
|
+
Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{@document.css("#clip_id").attr('value')}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
|
109
108
|
# matches channel pages
|
110
|
-
|
109
|
+
elsif player = @document.css(".player")
|
111
110
|
html = ""
|
112
111
|
player.each do |video|
|
113
112
|
if video.to_html =~ /clip_id=([0-9]+)/
|
@@ -115,16 +114,13 @@ module Readability
|
|
115
114
|
end
|
116
115
|
end
|
117
116
|
Nokogiri::HTML.fragment(html)
|
118
|
-
# matches non-channel or pages that used swfobject to print player
|
119
|
-
elsif @html.to_html =~ /clip_id=([0-9]+)/
|
120
|
-
Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{$1}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
|
121
117
|
else
|
122
118
|
nil
|
123
119
|
end
|
124
120
|
end
|
125
121
|
|
126
122
|
def ted
|
127
|
-
if (player = @
|
123
|
+
if (player = @document.css(".copy_paste")).present?
|
128
124
|
unless player.first.attr("value").blank?
|
129
125
|
Nokogiri::HTML.fragment(player.first.attr("value").to_s)
|
130
126
|
else
|
@@ -140,8 +136,8 @@ module Readability
|
|
140
136
|
# Things like preambles, content split by ads that we removed, etc.
|
141
137
|
|
142
138
|
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
|
143
|
-
output = Nokogiri::XML::Node.new('div', @
|
144
|
-
best_candidate[:elem].parent.
|
139
|
+
output = Nokogiri::XML::Node.new('div', @document)
|
140
|
+
best_candidate[:elem].parent.try(:children).each do |sibling|
|
145
141
|
append = false
|
146
142
|
append = true if sibling == best_candidate[:elem]
|
147
143
|
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
|
@@ -173,10 +169,10 @@ module Readability
|
|
173
169
|
|
174
170
|
debug("Top 5 candidates:")
|
175
171
|
sorted_candidates[0...5].each do |candidate|
|
176
|
-
debug("Candidate #{candidate[:elem].
|
172
|
+
debug("Candidate #{candidate[:elem].try(:name)}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
|
177
173
|
end
|
178
174
|
|
179
|
-
best_candidate = sorted_candidates.first || { :elem => @
|
175
|
+
best_candidate = sorted_candidates.first || { :elem => @document.css("body").first, :content_score => 0 }
|
180
176
|
#debug("Best candidate #{best_candidate[:elem].andand.name} with score #{best_candidate[:content_score]}")
|
181
177
|
best_candidate
|
182
178
|
end
|
@@ -190,7 +186,7 @@ module Readability
|
|
190
186
|
|
191
187
|
def score_paragraphs(min_text_length)
|
192
188
|
candidates = {}
|
193
|
-
@
|
189
|
+
@document.css("p,td").each do |elem|
|
194
190
|
parent_node = elem.parent
|
195
191
|
grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
|
196
192
|
inner_text = elem.text
|
@@ -263,7 +259,7 @@ module Readability
|
|
263
259
|
end
|
264
260
|
|
265
261
|
def remove_unlikely_candidates!
|
266
|
-
@
|
262
|
+
@document.css("*").each do |elem|
|
267
263
|
str = "#{elem[:class]}#{elem[:id]}"
|
268
264
|
if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
|
269
265
|
debug("Removing unlikely candidate - #{str}")
|
@@ -273,7 +269,7 @@ module Readability
|
|
273
269
|
end
|
274
270
|
|
275
271
|
def transform_misused_divs_into_paragraphs!
|
276
|
-
@
|
272
|
+
@document.css("*").each do |elem|
|
277
273
|
if elem.name.downcase == "div"
|
278
274
|
# transform <div>s that do not contain other block elements into <p>s
|
279
275
|
if elem.inner_html !~ REGEXES[:divToPElementsRe]
|