busk-ruby-readability 1.0.7 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +10 -0
- data/Gemfile.lock +28 -0
- data/README +9 -0
- data/Rakefile +2 -0
- data/lib/readability.rb +24 -16
- data/ruby-readability.gemspec +13 -0
- data/spec/fixtures/folha.html +29 -0
- data/spec/readability_spec.rb +35 -15
- data/spec/spec_helper.rb +4 -9
- data/test_on_url.rb +6 -0
- metadata +28 -11
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
busk-ruby-readability (1.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: http://rubygems.org/
|
8
|
+
specs:
|
9
|
+
diff-lcs (1.1.2)
|
10
|
+
fakeweb (1.3.0)
|
11
|
+
nokogiri (1.4.4)
|
12
|
+
rspec (2.4.0)
|
13
|
+
rspec-core (~> 2.4.0)
|
14
|
+
rspec-expectations (~> 2.4.0)
|
15
|
+
rspec-mocks (~> 2.4.0)
|
16
|
+
rspec-core (2.4.0)
|
17
|
+
rspec-expectations (2.4.0)
|
18
|
+
diff-lcs (~> 1.1.2)
|
19
|
+
rspec-mocks (2.4.0)
|
20
|
+
|
21
|
+
PLATFORMS
|
22
|
+
ruby
|
23
|
+
|
24
|
+
DEPENDENCIES
|
25
|
+
busk-ruby-readability!
|
26
|
+
fakeweb
|
27
|
+
nokogiri
|
28
|
+
rspec
|
data/README
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
|
2
|
+
|
3
|
+
This is a Ruby port of arc90's Readability project.
|
4
|
+
|
5
|
+
http://lab.arc90.com/experiments/readability/
|
6
|
+
|
7
|
+
Given an HTML document, it pulls out the main body text and cleans it up.
|
8
|
+
|
9
|
+
Ruby port by starrhorne and iterationlabs
|
data/Rakefile
ADDED
data/lib/readability.rb
CHANGED
@@ -43,6 +43,7 @@ module Readability
|
|
43
43
|
end
|
44
44
|
|
45
45
|
def content(remove_unlikely_candidates = true)
|
46
|
+
debug "Starting the content heuristic"
|
46
47
|
@document.css("script, style").each {|el| el.remove }
|
47
48
|
@document.search('//comment()').each {|el| el.remove }
|
48
49
|
|
@@ -58,9 +59,9 @@ module Readability
|
|
58
59
|
candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
|
59
60
|
best_candidate = select_best_candidate(candidates)
|
60
61
|
article = get_article(candidates, best_candidate)
|
61
|
-
|
62
|
+
|
62
63
|
cleaned_article = sanitize(article, candidates, options)
|
63
|
-
|
64
|
+
|
64
65
|
if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
|
65
66
|
make_html
|
66
67
|
content(false)
|
@@ -69,24 +70,25 @@ module Readability
|
|
69
70
|
end
|
70
71
|
end
|
71
72
|
end
|
72
|
-
|
73
|
+
|
73
74
|
def is_youtube?
|
74
|
-
(@base_uri.to_s =~ /^
|
75
|
+
(@base_uri.to_s =~ /^(www\.)?youtube.com/)
|
75
76
|
end
|
76
|
-
|
77
|
+
|
77
78
|
def is_vimeo?
|
78
|
-
(@base_uri.to_s =~ /^
|
79
|
+
(@base_uri.to_s =~ /^(www.)?vimeo.com/)
|
79
80
|
end
|
80
81
|
|
81
82
|
def is_ted?
|
82
|
-
(@base_uri.to_s =~ /^
|
83
|
+
(@base_uri.to_s =~ /^(www.)?ted.com\/talks/)
|
83
84
|
end
|
84
|
-
|
85
|
+
|
85
86
|
def is_special_case?
|
86
87
|
(@base_uri.to_s =~ REGEXES[:videoRe])
|
87
88
|
end
|
88
|
-
|
89
|
+
|
89
90
|
def youtube
|
91
|
+
debug("I have a Youtube video page")
|
90
92
|
if @request =~ /\?v=([_\-a-z0-9]+)&?/i
|
91
93
|
Nokogiri::HTML.fragment <<-HTML
|
92
94
|
<object width="706" height="422">
|
@@ -100,8 +102,9 @@ module Readability
|
|
100
102
|
nil
|
101
103
|
end
|
102
104
|
end
|
103
|
-
|
105
|
+
|
104
106
|
def vimeo
|
107
|
+
debug("I have a Vimeo video page")
|
105
108
|
# matches non-channel or pages that used swfobject to print player
|
106
109
|
if @document.css("#clip_id")
|
107
110
|
Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{@document.css("#clip_id").attr('value')}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
|
@@ -118,8 +121,9 @@ module Readability
|
|
118
121
|
nil
|
119
122
|
end
|
120
123
|
end
|
121
|
-
|
124
|
+
|
122
125
|
def ted
|
126
|
+
debug("I have a TED video page")
|
123
127
|
if (player = @document.css(".copy_paste")).present?
|
124
128
|
unless player.first.attr("value").blank?
|
125
129
|
Nokogiri::HTML.fragment(player.first.attr("value").to_s)
|
@@ -130,7 +134,7 @@ module Readability
|
|
130
134
|
nil
|
131
135
|
end
|
132
136
|
end
|
133
|
-
|
137
|
+
|
134
138
|
def get_article(candidates, best_candidate)
|
135
139
|
# Now that we have the top candidate, look through its siblings for content that might also be related.
|
136
140
|
# Things like preambles, content split by ads that we removed, etc.
|
@@ -164,7 +168,7 @@ module Readability
|
|
164
168
|
end
|
165
169
|
|
166
170
|
def select_best_candidate(candidates)
|
167
|
-
@best_candidate ||= begin
|
171
|
+
@best_candidate ||= begin
|
168
172
|
sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
|
169
173
|
|
170
174
|
debug("Top 5 candidates:")
|
@@ -239,6 +243,10 @@ module Readability
|
|
239
243
|
weight
|
240
244
|
end
|
241
245
|
|
246
|
+
def convert_to_utf8(string)
|
247
|
+
string.unpack("C*").pack("U*")
|
248
|
+
end
|
249
|
+
|
242
250
|
def score_node(elem)
|
243
251
|
content_score = class_weight(elem)
|
244
252
|
case elem.name.downcase
|
@@ -255,7 +263,7 @@ module Readability
|
|
255
263
|
end
|
256
264
|
|
257
265
|
def debug(str)
|
258
|
-
puts str if options[:debug]
|
266
|
+
puts "READABILITY : "+ str if options[:debug]
|
259
267
|
end
|
260
268
|
|
261
269
|
def remove_unlikely_candidates!
|
@@ -296,7 +304,7 @@ module Readability
|
|
296
304
|
node.css("form").each do |elem|
|
297
305
|
elem.remove
|
298
306
|
end
|
299
|
-
|
307
|
+
|
300
308
|
node.css("iframe").each do |iframe|
|
301
309
|
unless iframe.attr("src").to_s =~ REGEXES[:videoRe]
|
302
310
|
iframe.remove
|
@@ -326,7 +334,7 @@ module Readability
|
|
326
334
|
to_remove = false
|
327
335
|
reason = ""
|
328
336
|
|
329
|
-
if (counts["img"] > counts["p"]) && (counts["p"] > 0)
|
337
|
+
if (counts["img"] > counts["p"]) && (counts["p"] > 0)
|
330
338
|
reason = "too many images #{counts['p']}"
|
331
339
|
to_remove = true
|
332
340
|
elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
|
@@ -0,0 +1,13 @@
|
|
1
|
+
Gem::Specification.new do |s|
|
2
|
+
s.authors = ["Fabio Mont Alegre", "Rodrigo Flores"]
|
3
|
+
s.email = "it-team@busk.com"
|
4
|
+
s.homepage = "http://github.com/busk/ruby-readability"
|
5
|
+
s.version = "1.1.0"
|
6
|
+
s.name = "busk-ruby-readability"
|
7
|
+
s.summary = "A rewrite of original ruby-readability"
|
8
|
+
|
9
|
+
s.files = `git ls-files`.split("\n")
|
10
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
11
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
12
|
+
s.require_paths = ["lib"]
|
13
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>Folha.com - Equilíbrio e Saúde - Anvisa detecta pelo de roedor e interdita chá da Dr. Oetker - 27/01/2011</title>
|
5
|
+
<meta name="title" content="Anvisa detecta pelo de roedor e interdita chá da Dr. Oetker">
|
6
|
+
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
7
|
+
</head>
|
8
|
+
|
9
|
+
<body>
|
10
|
+
<div id="articleBy">
|
11
|
+
<p>
|
12
|
+
COLABORAÇÃO PARA A <b>FOLHA</b>
|
13
|
+
</p>
|
14
|
+
|
15
|
+
<p>
|
16
|
+
A Anvisa (Agência Nacional de Vigilância Sanitária) interditou o lote do chá de erva doce da marca Dr. Oetker. A medida foi publicada no "Diário Oficial da União" na quarta-feira (26).
|
17
|
+
</p>
|
18
|
+
|
19
|
+
<p>
|
20
|
+
Segundo a Vigilância Sanitária, o lote L160T02 do produto --data de validade 01/12/2011-- apresentou resultado insatisfatório no ensaio de pesquisa para matérias macroscópicas e microscópicas que detectou a presença de pêlo de roedor e fragmentos de inseto.
|
21
|
+
</p>
|
22
|
+
|
23
|
+
<p>
|
24
|
+
A interdição cautelar vale pelo período de 90 dias após a data de publicação. Durante esse tempo, o produto interditado não deve ser consumido e nem comercializado. As pessoas que já adquiriram o produto do lote suspenso devem interromper o consumo.
|
25
|
+
</p>
|
26
|
+
</div>
|
27
|
+
</body>
|
28
|
+
|
29
|
+
</html>
|
data/spec/readability_spec.rb
CHANGED
@@ -47,7 +47,7 @@ describe Readability do
|
|
47
47
|
</body>
|
48
48
|
</html>
|
49
49
|
HTML
|
50
|
-
|
50
|
+
|
51
51
|
@doc = Readability::Document.new(@html, nil, nil)
|
52
52
|
@elem1 = @doc.document.css("#elem1").first
|
53
53
|
@elem2 = @doc.document.css("#elem2").first
|
@@ -105,7 +105,7 @@ describe Readability do
|
|
105
105
|
</body>
|
106
106
|
</html>
|
107
107
|
HTML
|
108
|
-
|
108
|
+
|
109
109
|
@doc = Readability::Document.new(@html, nil, nil)
|
110
110
|
@candidates = @doc.score_paragraphs(0)
|
111
111
|
end
|
@@ -140,6 +140,27 @@ describe Readability do
|
|
140
140
|
end
|
141
141
|
end
|
142
142
|
|
143
|
+
describe 'dealing with iso-8859-1' do
|
144
|
+
before(:each) do
|
145
|
+
file = File.open('spec/fixtures/folha.html', 'r')
|
146
|
+
@content = file.read
|
147
|
+
end
|
148
|
+
|
149
|
+
it "should return the main page content" do
|
150
|
+
Readability::Document.new(Nokogiri::HTML(@content, nil, 'ISO-8859'),nil,nil).content.unpack("C*").pack("U*") .should == "<div><div>\n <p>\n COLABORA\303\207\303\203O PARA A FOLHA\n </p>\n <p>\n A Anvisa (Ag\303\252ncia Nacional de Vigil\303\242ncia Sanit\303\241ria) interditou o lote do ch\303\241 de erva doce da marca Dr. Oetker. A medida foi publicada no \"Di\303\241rio Oficial da Uni\303\243o\" na quarta-feira (26).\n </p>\n <p>\n Segundo a Vigil\303\242ncia Sanit\303\241ria, o lote L160T02 do produto --data de validade 01/12/2011-- apresentou resultado insatisfat\303\263rio no ensaio de pesquisa para mat\303\251rias macrosc\303\263picas e microsc\303\263picas que detectou a presen\303\247a de p\303\252lo de roedor e fragmentos de inseto.\n </p>\n <p>\n A interdi\303\247\303\243o cautelar vale pelo per\303\255odo de 90 dias ap\303\263s a data de publica\303\247\303\243o. Durante esse tempo, o produto interditado n\303\243o deve ser consumido e nem comercializado. As pessoas que j\303\241 adquiriram o produto do lote suspenso devem interromper o consumo.\n </p>\n</div></div>"
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
describe 'dealing with utf-8' do
|
155
|
+
before do
|
156
|
+
@doc = Readability::Document.new(Nokogiri::HTML("<html><head><title>title!</title></head><body><div><p>Açougue, espátula, Vovô, çáóéãà</p></div></body>", nil, 'UTF-8'), nil, nil, :min_text_length => 0, :retry_length => 1)
|
157
|
+
end
|
158
|
+
|
159
|
+
it 'should return the main page content' do
|
160
|
+
@doc.content.should match("Açougue, espátula, Vovô, çáóéãà")
|
161
|
+
end
|
162
|
+
end
|
163
|
+
|
143
164
|
describe "ignoring sidebars" do
|
144
165
|
before do
|
145
166
|
@doc = Readability::Document.new(Nokogiri::HTML("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>"), nil, nil, :min_text_length => 0, :retry_length => 1)
|
@@ -149,13 +170,13 @@ describe Readability do
|
|
149
170
|
@doc.content.should_not match("sidebar")
|
150
171
|
end
|
151
172
|
end
|
152
|
-
|
173
|
+
|
153
174
|
describe "outputs good stuff for known documents" do
|
154
175
|
before do
|
155
176
|
@html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
|
156
177
|
@samples = @html_files.map {|filename| File.basename(filename, '.html') }
|
157
178
|
end
|
158
|
-
|
179
|
+
|
159
180
|
it "should output expected fragments of text" do
|
160
181
|
|
161
182
|
checks = 0
|
@@ -165,12 +186,12 @@ describe Readability do
|
|
165
186
|
|
166
187
|
load "fixtures/samples/#{sample}-fragments.rb"
|
167
188
|
puts "testing #{sample}..."
|
168
|
-
|
189
|
+
|
169
190
|
$required_fragments.each do |required_text|
|
170
191
|
doc.should include(required_text)
|
171
192
|
checks += 1
|
172
193
|
end
|
173
|
-
|
194
|
+
|
174
195
|
$excluded_fragments.each do |text_to_avoid|
|
175
196
|
doc.should_not include(text_to_avoid)
|
176
197
|
checks += 1
|
@@ -179,21 +200,20 @@ describe Readability do
|
|
179
200
|
puts "Performed #{checks} checks."
|
180
201
|
end
|
181
202
|
end
|
182
|
-
|
203
|
+
|
183
204
|
describe "handles vimeo.com videos" do
|
184
|
-
|
185
|
-
before(:each) do
|
205
|
+
|
206
|
+
before(:each) do
|
186
207
|
FakeWeb.register_uri(:get, 'http://vimeo.com/10365005',
|
187
208
|
:response => File.read("spec/fixtures/vimeo.com.html"))
|
188
209
|
@uri = URI.parse("http://vimeo.com/10365005")
|
189
|
-
|
190
|
-
@content = Readability::Document.new(Nokogiri::HTML(open('http://vimeo.com/10365005')), @uri, @uri).content
|
210
|
+
@content = Readability::Document.new(Nokogiri::HTML(open('http://vimeo.com/10365005')), @uri.host, @uri.request_uri).content
|
191
211
|
end
|
192
|
-
|
212
|
+
|
193
213
|
it "should extract the video from the page" do
|
194
|
-
@content.should include("<iframe src=\"http://player.vimeo.com/video/10365005")
|
214
|
+
@content.should include("<iframe src=\"http://player.vimeo.com/video/10365005\"")
|
195
215
|
end
|
196
|
-
|
216
|
+
|
197
217
|
end
|
198
|
-
|
218
|
+
|
199
219
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,13 +1,8 @@
|
|
1
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
2
|
-
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
3
1
|
require 'rubygems'
|
4
2
|
require 'readability'
|
5
|
-
require '
|
6
|
-
require 'spec/autorun'
|
7
|
-
require 'nokogiri'
|
8
|
-
require 'open-uri'
|
9
|
-
require 'fakeweb'
|
3
|
+
require 'bundler'
|
10
4
|
|
11
|
-
|
5
|
+
Bundler.setup(:test)
|
12
6
|
|
13
|
-
|
7
|
+
require 'open-uri'
|
8
|
+
require 'fakeweb'
|
data/test_on_url.rb
ADDED
metadata
CHANGED
@@ -1,26 +1,27 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: busk-ruby-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
+
- 1
|
8
9
|
- 0
|
9
|
-
|
10
|
-
version: 1.0.7
|
10
|
+
version: 1.1.0
|
11
11
|
platform: ruby
|
12
|
-
authors:
|
13
|
-
|
12
|
+
authors:
|
13
|
+
- Fabio Mont Alegre
|
14
|
+
- Rodrigo Flores
|
14
15
|
autorequire:
|
15
16
|
bindir: bin
|
16
17
|
cert_chain: []
|
17
18
|
|
18
|
-
date: 2011-01-
|
19
|
+
date: 2011-01-27 00:00:00 -02:00
|
19
20
|
default_executable:
|
20
21
|
dependencies: []
|
21
22
|
|
22
23
|
description:
|
23
|
-
email:
|
24
|
+
email: it-team@busk.com
|
24
25
|
executables: []
|
25
26
|
|
26
27
|
extensions: []
|
@@ -28,8 +29,14 @@ extensions: []
|
|
28
29
|
extra_rdoc_files: []
|
29
30
|
|
30
31
|
files:
|
32
|
+
- Gemfile
|
33
|
+
- Gemfile.lock
|
34
|
+
- README
|
35
|
+
- Rakefile
|
31
36
|
- lib/readability.rb
|
37
|
+
- ruby-readability.gemspec
|
32
38
|
- spec/fixtures/cant_read.html
|
39
|
+
- spec/fixtures/folha.html
|
33
40
|
- spec/fixtures/sample.html
|
34
41
|
- spec/fixtures/samples/channel4-1-fragments.rb
|
35
42
|
- spec/fixtures/samples/channel4-1.html
|
@@ -40,6 +47,7 @@ files:
|
|
40
47
|
- spec/readability_spec.rb
|
41
48
|
- spec/spec.opts
|
42
49
|
- spec/spec_helper.rb
|
50
|
+
- test_on_url.rb
|
43
51
|
has_rdoc: true
|
44
52
|
homepage: http://github.com/busk/ruby-readability
|
45
53
|
licenses: []
|
@@ -49,8 +57,6 @@ rdoc_options: []
|
|
49
57
|
|
50
58
|
require_paths:
|
51
59
|
- lib
|
52
|
-
- spec
|
53
|
-
- spec/fixtures
|
54
60
|
required_ruby_version: !ruby/object:Gem::Requirement
|
55
61
|
none: false
|
56
62
|
requirements:
|
@@ -76,5 +82,16 @@ rubygems_version: 1.3.7
|
|
76
82
|
signing_key:
|
77
83
|
specification_version: 3
|
78
84
|
summary: A rewrite of original ruby-readability
|
79
|
-
test_files:
|
80
|
-
|
85
|
+
test_files:
|
86
|
+
- spec/fixtures/cant_read.html
|
87
|
+
- spec/fixtures/folha.html
|
88
|
+
- spec/fixtures/sample.html
|
89
|
+
- spec/fixtures/samples/channel4-1-fragments.rb
|
90
|
+
- spec/fixtures/samples/channel4-1.html
|
91
|
+
- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
|
92
|
+
- spec/fixtures/samples/globemail-ottawa-cuts.html
|
93
|
+
- spec/fixtures/should_not_truncate.txt
|
94
|
+
- spec/fixtures/vimeo.com.html
|
95
|
+
- spec/readability_spec.rb
|
96
|
+
- spec/spec.opts
|
97
|
+
- spec/spec_helper.rb
|