busk-ruby-readability 1.0.7 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +10 -0
- data/Gemfile.lock +28 -0
- data/README +9 -0
- data/Rakefile +2 -0
- data/lib/readability.rb +24 -16
- data/ruby-readability.gemspec +13 -0
- data/spec/fixtures/folha.html +29 -0
- data/spec/readability_spec.rb +35 -15
- data/spec/spec_helper.rb +4 -9
- data/test_on_url.rb +6 -0
- metadata +28 -11
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
busk-ruby-readability (1.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: http://rubygems.org/
|
8
|
+
specs:
|
9
|
+
diff-lcs (1.1.2)
|
10
|
+
fakeweb (1.3.0)
|
11
|
+
nokogiri (1.4.4)
|
12
|
+
rspec (2.4.0)
|
13
|
+
rspec-core (~> 2.4.0)
|
14
|
+
rspec-expectations (~> 2.4.0)
|
15
|
+
rspec-mocks (~> 2.4.0)
|
16
|
+
rspec-core (2.4.0)
|
17
|
+
rspec-expectations (2.4.0)
|
18
|
+
diff-lcs (~> 1.1.2)
|
19
|
+
rspec-mocks (2.4.0)
|
20
|
+
|
21
|
+
PLATFORMS
|
22
|
+
ruby
|
23
|
+
|
24
|
+
DEPENDENCIES
|
25
|
+
busk-ruby-readability!
|
26
|
+
fakeweb
|
27
|
+
nokogiri
|
28
|
+
rspec
|
data/README
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
|
2
|
+
|
3
|
+
This is a Ruby port of arc90's Readability project
|
4
|
+
|
5
|
+
http://lab.arc90.com/experiments/readability/
|
6
|
+
|
7
|
+
Given an HTML document, it pulls out the main body text and cleans it up.
|
8
|
+
|
9
|
+
Ruby port by starrhorne and iterationlabs
|
data/Rakefile
ADDED
data/lib/readability.rb
CHANGED
@@ -43,6 +43,7 @@ module Readability
|
|
43
43
|
end
|
44
44
|
|
45
45
|
def content(remove_unlikely_candidates = true)
|
46
|
+
debug "Starting the content heuristic"
|
46
47
|
@document.css("script, style").each {|el| el.remove }
|
47
48
|
@document.search('//comment()').each {|el| el.remove }
|
48
49
|
|
@@ -58,9 +59,9 @@ module Readability
|
|
58
59
|
candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
|
59
60
|
best_candidate = select_best_candidate(candidates)
|
60
61
|
article = get_article(candidates, best_candidate)
|
61
|
-
|
62
|
+
|
62
63
|
cleaned_article = sanitize(article, candidates, options)
|
63
|
-
|
64
|
+
|
64
65
|
if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
|
65
66
|
make_html
|
66
67
|
content(false)
|
@@ -69,24 +70,25 @@ module Readability
|
|
69
70
|
end
|
70
71
|
end
|
71
72
|
end
|
72
|
-
|
73
|
+
|
73
74
|
def is_youtube?
|
74
|
-
(@base_uri.to_s =~ /^
|
75
|
+
(@base_uri.to_s =~ /^(www\.)?youtube.com/)
|
75
76
|
end
|
76
|
-
|
77
|
+
|
77
78
|
def is_vimeo?
|
78
|
-
(@base_uri.to_s =~ /^
|
79
|
+
(@base_uri.to_s =~ /^(www.)?vimeo.com/)
|
79
80
|
end
|
80
81
|
|
81
82
|
def is_ted?
|
82
|
-
(@base_uri.to_s =~ /^
|
83
|
+
(@base_uri.to_s =~ /^(www.)?ted.com\/talks/)
|
83
84
|
end
|
84
|
-
|
85
|
+
|
85
86
|
def is_special_case?
|
86
87
|
(@base_uri.to_s =~ REGEXES[:videoRe])
|
87
88
|
end
|
88
|
-
|
89
|
+
|
89
90
|
def youtube
|
91
|
+
debug("I have a Youtube video page")
|
90
92
|
if @request =~ /\?v=([_\-a-z0-9]+)&?/i
|
91
93
|
Nokogiri::HTML.fragment <<-HTML
|
92
94
|
<object width="706" height="422">
|
@@ -100,8 +102,9 @@ module Readability
|
|
100
102
|
nil
|
101
103
|
end
|
102
104
|
end
|
103
|
-
|
105
|
+
|
104
106
|
def vimeo
|
107
|
+
debug("I have a Vimeo video page")
|
105
108
|
# matches non-channel or pages that used swfobject to print player
|
106
109
|
if @document.css("#clip_id")
|
107
110
|
Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{@document.css("#clip_id").attr('value')}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
|
@@ -118,8 +121,9 @@ module Readability
|
|
118
121
|
nil
|
119
122
|
end
|
120
123
|
end
|
121
|
-
|
124
|
+
|
122
125
|
def ted
|
126
|
+
debug("I have a TED video page")
|
123
127
|
if (player = @document.css(".copy_paste")).present?
|
124
128
|
unless player.first.attr("value").blank?
|
125
129
|
Nokogiri::HTML.fragment(player.first.attr("value").to_s)
|
@@ -130,7 +134,7 @@ module Readability
|
|
130
134
|
nil
|
131
135
|
end
|
132
136
|
end
|
133
|
-
|
137
|
+
|
134
138
|
def get_article(candidates, best_candidate)
|
135
139
|
# Now that we have the top candidate, look through its siblings for content that might also be related.
|
136
140
|
# Things like preambles, content split by ads that we removed, etc.
|
@@ -164,7 +168,7 @@ module Readability
|
|
164
168
|
end
|
165
169
|
|
166
170
|
def select_best_candidate(candidates)
|
167
|
-
@best_candidate ||= begin
|
171
|
+
@best_candidate ||= begin
|
168
172
|
sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
|
169
173
|
|
170
174
|
debug("Top 5 candidates:")
|
@@ -239,6 +243,10 @@ module Readability
|
|
239
243
|
weight
|
240
244
|
end
|
241
245
|
|
246
|
+
def convert_to_utf8(string)
|
247
|
+
string.unpack("C*").pack("U*")
|
248
|
+
end
|
249
|
+
|
242
250
|
def score_node(elem)
|
243
251
|
content_score = class_weight(elem)
|
244
252
|
case elem.name.downcase
|
@@ -255,7 +263,7 @@ module Readability
|
|
255
263
|
end
|
256
264
|
|
257
265
|
def debug(str)
|
258
|
-
puts str if options[:debug]
|
266
|
+
puts "READABILITY : "+ str if options[:debug]
|
259
267
|
end
|
260
268
|
|
261
269
|
def remove_unlikely_candidates!
|
@@ -296,7 +304,7 @@ module Readability
|
|
296
304
|
node.css("form").each do |elem|
|
297
305
|
elem.remove
|
298
306
|
end
|
299
|
-
|
307
|
+
|
300
308
|
node.css("iframe").each do |iframe|
|
301
309
|
unless iframe.attr("src").to_s =~ REGEXES[:videoRe]
|
302
310
|
iframe.remove
|
@@ -326,7 +334,7 @@ module Readability
|
|
326
334
|
to_remove = false
|
327
335
|
reason = ""
|
328
336
|
|
329
|
-
if (counts["img"] > counts["p"]) && (counts["p"] > 0)
|
337
|
+
if (counts["img"] > counts["p"]) && (counts["p"] > 0)
|
330
338
|
reason = "too many images #{counts['p']}"
|
331
339
|
to_remove = true
|
332
340
|
elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
|
@@ -0,0 +1,13 @@
|
|
1
|
+
Gem::Specification.new do |s|
|
2
|
+
s.authors = ["Fabio Mont Alegre", "Rodrigo Flores"]
|
3
|
+
s.email = "it-team@busk.com"
|
4
|
+
s.homepage = "http://github.com/busk/ruby-readability"
|
5
|
+
s.version = "1.1.0"
|
6
|
+
s.name = "busk-ruby-readability"
|
7
|
+
s.summary = "A rewrite of original ruby-readability"
|
8
|
+
|
9
|
+
s.files = `git ls-files`.split("\n")
|
10
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
11
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
12
|
+
s.require_paths = ["lib"]
|
13
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>Folha.com - Equilíbrio e Saúde - Anvisa detecta pelo de roedor e interdita chá da Dr. Oetker - 27/01/2011</title>
|
5
|
+
<meta name="title" content="Anvisa detecta pelo de roedor e interdita chá da Dr. Oetker">
|
6
|
+
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
7
|
+
</head>
|
8
|
+
|
9
|
+
<body>
|
10
|
+
<div id="articleBy">
|
11
|
+
<p>
|
12
|
+
COLABORAÇÃO PARA A <b>FOLHA</b>
|
13
|
+
</p>
|
14
|
+
|
15
|
+
<p>
|
16
|
+
A Anvisa (Agência Nacional de Vigilância Sanitária) interditou o lote do chá de erva doce da marca Dr. Oetker. A medida foi publicada no "Diário Oficial da União" na quarta-feira (26).
|
17
|
+
</p>
|
18
|
+
|
19
|
+
<p>
|
20
|
+
Segundo a Vigilância Sanitária, o lote L160T02 do produto --data de validade 01/12/2011-- apresentou resultado insatisfatório no ensaio de pesquisa para matérias macroscópicas e microscópicas que detectou a presença de pêlo de roedor e fragmentos de inseto.
|
21
|
+
</p>
|
22
|
+
|
23
|
+
<p>
|
24
|
+
A interdição cautelar vale pelo período de 90 dias após a data de publicação. Durante esse tempo, o produto interditado não deve ser consumido e nem comercializado. As pessoas que já adquiriram o produto do lote suspenso devem interromper o consumo.
|
25
|
+
</p>
|
26
|
+
</div>
|
27
|
+
</body>
|
28
|
+
|
29
|
+
</html>
|
data/spec/readability_spec.rb
CHANGED
@@ -47,7 +47,7 @@ describe Readability do
|
|
47
47
|
</body>
|
48
48
|
</html>
|
49
49
|
HTML
|
50
|
-
|
50
|
+
|
51
51
|
@doc = Readability::Document.new(@html, nil, nil)
|
52
52
|
@elem1 = @doc.document.css("#elem1").first
|
53
53
|
@elem2 = @doc.document.css("#elem2").first
|
@@ -105,7 +105,7 @@ describe Readability do
|
|
105
105
|
</body>
|
106
106
|
</html>
|
107
107
|
HTML
|
108
|
-
|
108
|
+
|
109
109
|
@doc = Readability::Document.new(@html, nil, nil)
|
110
110
|
@candidates = @doc.score_paragraphs(0)
|
111
111
|
end
|
@@ -140,6 +140,27 @@ describe Readability do
|
|
140
140
|
end
|
141
141
|
end
|
142
142
|
|
143
|
+
describe 'dealing with iso-8859-1' do
|
144
|
+
before(:each) do
|
145
|
+
file = File.open('spec/fixtures/folha.html', 'r')
|
146
|
+
@content = file.read
|
147
|
+
end
|
148
|
+
|
149
|
+
it "should return the main page content" do
|
150
|
+
Readability::Document.new(Nokogiri::HTML(@content, nil, 'ISO-8859'),nil,nil).content.unpack("C*").pack("U*") .should == "<div><div>\n <p>\n COLABORA\303\207\303\203O PARA A FOLHA\n </p>\n <p>\n A Anvisa (Ag\303\252ncia Nacional de Vigil\303\242ncia Sanit\303\241ria) interditou o lote do ch\303\241 de erva doce da marca Dr. Oetker. A medida foi publicada no \"Di\303\241rio Oficial da Uni\303\243o\" na quarta-feira (26).\n </p>\n <p>\n Segundo a Vigil\303\242ncia Sanit\303\241ria, o lote L160T02 do produto --data de validade 01/12/2011-- apresentou resultado insatisfat\303\263rio no ensaio de pesquisa para mat\303\251rias macrosc\303\263picas e microsc\303\263picas que detectou a presen\303\247a de p\303\252lo de roedor e fragmentos de inseto.\n </p>\n <p>\n A interdi\303\247\303\243o cautelar vale pelo per\303\255odo de 90 dias ap\303\263s a data de publica\303\247\303\243o. Durante esse tempo, o produto interditado n\303\243o deve ser consumido e nem comercializado. As pessoas que j\303\241 adquiriram o produto do lote suspenso devem interromper o consumo.\n </p>\n</div></div>"
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
describe 'dealing with utf-8' do
|
155
|
+
before do
|
156
|
+
@doc = Readability::Document.new(Nokogiri::HTML("<html><head><title>title!</title></head><body><div><p>Açougue, espátula, Vovô, çáóéãà</p></div></body>", nil, 'UTF-8'), nil, nil, :min_text_length => 0, :retry_length => 1)
|
157
|
+
end
|
158
|
+
|
159
|
+
it 'should return the main page content' do
|
160
|
+
@doc.content.should match("Açougue, espátula, Vovô, çáóéãà")
|
161
|
+
end
|
162
|
+
end
|
163
|
+
|
143
164
|
describe "ignoring sidebars" do
|
144
165
|
before do
|
145
166
|
@doc = Readability::Document.new(Nokogiri::HTML("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>"), nil, nil, :min_text_length => 0, :retry_length => 1)
|
@@ -149,13 +170,13 @@ describe Readability do
|
|
149
170
|
@doc.content.should_not match("sidebar")
|
150
171
|
end
|
151
172
|
end
|
152
|
-
|
173
|
+
|
153
174
|
describe "outputs good stuff for known documents" do
|
154
175
|
before do
|
155
176
|
@html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
|
156
177
|
@samples = @html_files.map {|filename| File.basename(filename, '.html') }
|
157
178
|
end
|
158
|
-
|
179
|
+
|
159
180
|
it "should output expected fragments of text" do
|
160
181
|
|
161
182
|
checks = 0
|
@@ -165,12 +186,12 @@ describe Readability do
|
|
165
186
|
|
166
187
|
load "fixtures/samples/#{sample}-fragments.rb"
|
167
188
|
puts "testing #{sample}..."
|
168
|
-
|
189
|
+
|
169
190
|
$required_fragments.each do |required_text|
|
170
191
|
doc.should include(required_text)
|
171
192
|
checks += 1
|
172
193
|
end
|
173
|
-
|
194
|
+
|
174
195
|
$excluded_fragments.each do |text_to_avoid|
|
175
196
|
doc.should_not include(text_to_avoid)
|
176
197
|
checks += 1
|
@@ -179,21 +200,20 @@ describe Readability do
|
|
179
200
|
puts "Performed #{checks} checks."
|
180
201
|
end
|
181
202
|
end
|
182
|
-
|
203
|
+
|
183
204
|
describe "handles vimeo.com videos" do
|
184
|
-
|
185
|
-
before(:each) do
|
205
|
+
|
206
|
+
before(:each) do
|
186
207
|
FakeWeb.register_uri(:get, 'http://vimeo.com/10365005',
|
187
208
|
:response => File.read("spec/fixtures/vimeo.com.html"))
|
188
209
|
@uri = URI.parse("http://vimeo.com/10365005")
|
189
|
-
|
190
|
-
@content = Readability::Document.new(Nokogiri::HTML(open('http://vimeo.com/10365005')), @uri, @uri).content
|
210
|
+
@content = Readability::Document.new(Nokogiri::HTML(open('http://vimeo.com/10365005')), @uri.host, @uri.request_uri).content
|
191
211
|
end
|
192
|
-
|
212
|
+
|
193
213
|
it "should extract the video from the page" do
|
194
|
-
@content.should include("<iframe src=\"http://player.vimeo.com/video/10365005")
|
214
|
+
@content.should include("<iframe src=\"http://player.vimeo.com/video/10365005\"")
|
195
215
|
end
|
196
|
-
|
216
|
+
|
197
217
|
end
|
198
|
-
|
218
|
+
|
199
219
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,13 +1,8 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
3
1
|
require 'rubygems'
|
4
2
|
require 'readability'
|
5
|
-
require '
|
6
|
-
require 'spec/autorun'
|
7
|
-
require 'nokogiri'
|
8
|
-
require 'open-uri'
|
9
|
-
require 'fakeweb'
|
3
|
+
require 'bundler'
|
10
4
|
|
11
|
-
|
5
|
+
Bundler.setup(:test)
|
12
6
|
|
13
|
-
|
7
|
+
require 'open-uri'
|
8
|
+
require 'fakeweb'
|
data/test_on_url.rb
ADDED
metadata
CHANGED
@@ -1,26 +1,27 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: busk-ruby-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
+
- 1
|
8
9
|
- 0
|
9
|
-
|
10
|
-
version: 1.0.7
|
10
|
+
version: 1.1.0
|
11
11
|
platform: ruby
|
12
|
-
authors:
|
13
|
-
|
12
|
+
authors:
|
13
|
+
- Fabio Mont Alegre
|
14
|
+
- Rodrigo Flores
|
14
15
|
autorequire:
|
15
16
|
bindir: bin
|
16
17
|
cert_chain: []
|
17
18
|
|
18
|
-
date: 2011-01-
|
19
|
+
date: 2011-01-27 00:00:00 -02:00
|
19
20
|
default_executable:
|
20
21
|
dependencies: []
|
21
22
|
|
22
23
|
description:
|
23
|
-
email:
|
24
|
+
email: it-team@busk.com
|
24
25
|
executables: []
|
25
26
|
|
26
27
|
extensions: []
|
@@ -28,8 +29,14 @@ extensions: []
|
|
28
29
|
extra_rdoc_files: []
|
29
30
|
|
30
31
|
files:
|
32
|
+
- Gemfile
|
33
|
+
- Gemfile.lock
|
34
|
+
- README
|
35
|
+
- Rakefile
|
31
36
|
- lib/readability.rb
|
37
|
+
- ruby-readability.gemspec
|
32
38
|
- spec/fixtures/cant_read.html
|
39
|
+
- spec/fixtures/folha.html
|
33
40
|
- spec/fixtures/sample.html
|
34
41
|
- spec/fixtures/samples/channel4-1-fragments.rb
|
35
42
|
- spec/fixtures/samples/channel4-1.html
|
@@ -40,6 +47,7 @@ files:
|
|
40
47
|
- spec/readability_spec.rb
|
41
48
|
- spec/spec.opts
|
42
49
|
- spec/spec_helper.rb
|
50
|
+
- test_on_url.rb
|
43
51
|
has_rdoc: true
|
44
52
|
homepage: http://github.com/busk/ruby-readability
|
45
53
|
licenses: []
|
@@ -49,8 +57,6 @@ rdoc_options: []
|
|
49
57
|
|
50
58
|
require_paths:
|
51
59
|
- lib
|
52
|
-
- spec
|
53
|
-
- spec/fixtures
|
54
60
|
required_ruby_version: !ruby/object:Gem::Requirement
|
55
61
|
none: false
|
56
62
|
requirements:
|
@@ -76,5 +82,16 @@ rubygems_version: 1.3.7
|
|
76
82
|
signing_key:
|
77
83
|
specification_version: 3
|
78
84
|
summary: A rewrite of original ruby-readability
|
79
|
-
test_files:
|
80
|
-
|
85
|
+
test_files:
|
86
|
+
- spec/fixtures/cant_read.html
|
87
|
+
- spec/fixtures/folha.html
|
88
|
+
- spec/fixtures/sample.html
|
89
|
+
- spec/fixtures/samples/channel4-1-fragments.rb
|
90
|
+
- spec/fixtures/samples/channel4-1.html
|
91
|
+
- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
|
92
|
+
- spec/fixtures/samples/globemail-ottawa-cuts.html
|
93
|
+
- spec/fixtures/should_not_truncate.txt
|
94
|
+
- spec/fixtures/vimeo.com.html
|
95
|
+
- spec/readability_spec.rb
|
96
|
+
- spec/spec.opts
|
97
|
+
- spec/spec_helper.rb
|