busk-ruby-readability 1.0.7 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
+ source "http://rubygems.org"
2
+
3
+ gem "nokogiri"
4
+
5
+ group :test do
6
+ gem 'rspec'
7
+ gem 'fakeweb'
8
+ end
9
+
10
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,28 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ busk-ruby-readability (1.1.0)
5
+
6
+ GEM
7
+ remote: http://rubygems.org/
8
+ specs:
9
+ diff-lcs (1.1.2)
10
+ fakeweb (1.3.0)
11
+ nokogiri (1.4.4)
12
+ rspec (2.4.0)
13
+ rspec-core (~> 2.4.0)
14
+ rspec-expectations (~> 2.4.0)
15
+ rspec-mocks (~> 2.4.0)
16
+ rspec-core (2.4.0)
17
+ rspec-expectations (2.4.0)
18
+ diff-lcs (~> 1.1.2)
19
+ rspec-mocks (2.4.0)
20
+
21
+ PLATFORMS
22
+ ruby
23
+
24
+ DEPENDENCIES
25
+ busk-ruby-readability!
26
+ fakeweb
27
+ nokogiri
28
+ rspec
data/README ADDED
@@ -0,0 +1,9 @@
1
+ This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
2
+
3
+ This is a ruby port of arc90's readability project
4
+
5
+ http://lab.arc90.com/experiments/readability/
6
+
7
+ Given a html document, it pulls out the main body text and cleans it up.
8
+
9
+ Ruby port by starrhorne and iterationlabs
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
data/lib/readability.rb CHANGED
@@ -43,6 +43,7 @@ module Readability
43
43
  end
44
44
 
45
45
  def content(remove_unlikely_candidates = true)
46
+ debug "Starting the content heuristic"
46
47
  @document.css("script, style").each {|el| el.remove }
47
48
  @document.search('//comment()').each {|el| el.remove }
48
49
 
@@ -58,9 +59,9 @@ module Readability
58
59
  candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
59
60
  best_candidate = select_best_candidate(candidates)
60
61
  article = get_article(candidates, best_candidate)
61
-
62
+
62
63
  cleaned_article = sanitize(article, candidates, options)
63
-
64
+
64
65
  if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
65
66
  make_html
66
67
  content(false)
@@ -69,24 +70,25 @@ module Readability
69
70
  end
70
71
  end
71
72
  end
72
-
73
+
73
74
  def is_youtube?
74
- (@base_uri.to_s =~ /^http:\/\/(www\.)?youtube.com/)
75
+ (@base_uri.to_s =~ /^(www\.)?youtube.com/)
75
76
  end
76
-
77
+
77
78
  def is_vimeo?
78
- (@base_uri.to_s =~ /^http:\/\/(www.)?vimeo.com/)
79
+ (@base_uri.to_s =~ /^(www.)?vimeo.com/)
79
80
  end
80
81
 
81
82
  def is_ted?
82
- (@base_uri.to_s =~ /^http:\/\/(www.)?ted.com\/talks/)
83
+ (@base_uri.to_s =~ /^(www.)?ted.com\/talks/)
83
84
  end
84
-
85
+
85
86
  def is_special_case?
86
87
  (@base_uri.to_s =~ REGEXES[:videoRe])
87
88
  end
88
-
89
+
89
90
  def youtube
91
+ debug("I have a Youtube video page")
90
92
  if @request =~ /\?v=([_\-a-z0-9]+)&?/i
91
93
  Nokogiri::HTML.fragment <<-HTML
92
94
  <object width="706" height="422">
@@ -100,8 +102,9 @@ module Readability
100
102
  nil
101
103
  end
102
104
  end
103
-
105
+
104
106
  def vimeo
107
+ debug("I have a Vimeo video page")
105
108
  # matches non-channel or pages that used swfobject to print player
106
109
  if @document.css("#clip_id")
107
110
  Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{@document.css("#clip_id").attr('value')}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
@@ -118,8 +121,9 @@ module Readability
118
121
  nil
119
122
  end
120
123
  end
121
-
124
+
122
125
  def ted
126
+ debug("I have a TED video page")
123
127
  if (player = @document.css(".copy_paste")).present?
124
128
  unless player.first.attr("value").blank?
125
129
  Nokogiri::HTML.fragment(player.first.attr("value").to_s)
@@ -130,7 +134,7 @@ module Readability
130
134
  nil
131
135
  end
132
136
  end
133
-
137
+
134
138
  def get_article(candidates, best_candidate)
135
139
  # Now that we have the top candidate, look through its siblings for content that might also be related.
136
140
  # Things like preambles, content split by ads that we removed, etc.
@@ -164,7 +168,7 @@ module Readability
164
168
  end
165
169
 
166
170
  def select_best_candidate(candidates)
167
- @best_candidate ||= begin
171
+ @best_candidate ||= begin
168
172
  sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
169
173
 
170
174
  debug("Top 5 candidates:")
@@ -239,6 +243,10 @@ module Readability
239
243
  weight
240
244
  end
241
245
 
246
+ def convert_to_utf8(string)
247
+ string.unpack("C*").pack("U*")
248
+ end
249
+
242
250
  def score_node(elem)
243
251
  content_score = class_weight(elem)
244
252
  case elem.name.downcase
@@ -255,7 +263,7 @@ module Readability
255
263
  end
256
264
 
257
265
  def debug(str)
258
- puts str if options[:debug]
266
+ puts "READABILITY : "+ str if options[:debug]
259
267
  end
260
268
 
261
269
  def remove_unlikely_candidates!
@@ -296,7 +304,7 @@ module Readability
296
304
  node.css("form").each do |elem|
297
305
  elem.remove
298
306
  end
299
-
307
+
300
308
  node.css("iframe").each do |iframe|
301
309
  unless iframe.attr("src").to_s =~ REGEXES[:videoRe]
302
310
  iframe.remove
@@ -326,7 +334,7 @@ module Readability
326
334
  to_remove = false
327
335
  reason = ""
328
336
 
329
- if (counts["img"] > counts["p"]) && (counts["p"] > 0)
337
+ if (counts["img"] > counts["p"]) && (counts["p"] > 0)
330
338
  reason = "too many images #{counts['p']}"
331
339
  to_remove = true
332
340
  elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
@@ -0,0 +1,13 @@
1
+ Gem::Specification.new do |s|
2
+ s.authors = ["Fabio Mont Alegre", "Rodrigo Flores"]
3
+ s.email = "it-team@busk.com"
4
+ s.homepage = "http://github.com/busk/ruby-readability"
5
+ s.version = "1.1.0"
6
+ s.name = "busk-ruby-readability"
7
+ s.summary = "A rewrite of original ruby-readability"
8
+
9
+ s.files = `git ls-files`.split("\n")
10
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
11
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
12
+ s.require_paths = ["lib"]
13
+ end
@@ -0,0 +1,29 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <title>Folha.com - Equilíbrio e Saúde - Anvisa detecta pelo de roedor e interdita chá da Dr. Oetker - 27/01/2011</title>
5
+ <meta name="title" content="Anvisa detecta pelo de roedor e interdita chá da Dr. Oetker">
6
+ <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
7
+ </head>
8
+
9
+ <body>
10
+ <div id="articleBy">
11
+ <p>
12
+ COLABORAÇÃO PARA A <b>FOLHA</b>
13
+ </p>
14
+
15
+ <p>
16
+ A Anvisa (Agência Nacional de Vigilância Sanitária) interditou o lote do chá de erva doce da marca Dr. Oetker. A medida foi publicada no &quot;Diário Oficial da União&quot; na quarta-feira (26).
17
+ </p>
18
+
19
+ <p>
20
+ Segundo a Vigilância Sanitária, o lote L160T02 do produto --data de validade 01/12/2011-- apresentou resultado insatisfatório no ensaio de pesquisa para matérias macroscópicas e microscópicas que detectou a presença de pêlo de roedor e fragmentos de inseto.
21
+ </p>
22
+
23
+ <p>
24
+ A interdição cautelar vale pelo período de 90 dias após a data de publicação. Durante esse tempo, o produto interditado não deve ser consumido e nem comercializado. As pessoas que já adquiriram o produto do lote suspenso devem interromper o consumo.
25
+ </p>
26
+ </div>
27
+ </body>
28
+
29
+ </html>
@@ -47,7 +47,7 @@ describe Readability do
47
47
  </body>
48
48
  </html>
49
49
  HTML
50
-
50
+
51
51
  @doc = Readability::Document.new(@html, nil, nil)
52
52
  @elem1 = @doc.document.css("#elem1").first
53
53
  @elem2 = @doc.document.css("#elem2").first
@@ -105,7 +105,7 @@ describe Readability do
105
105
  </body>
106
106
  </html>
107
107
  HTML
108
-
108
+
109
109
  @doc = Readability::Document.new(@html, nil, nil)
110
110
  @candidates = @doc.score_paragraphs(0)
111
111
  end
@@ -140,6 +140,27 @@ describe Readability do
140
140
  end
141
141
  end
142
142
 
143
+ describe 'dealing with iso-8859-1' do
144
+ before(:each) do
145
+ file = File.open('spec/fixtures/folha.html', 'r')
146
+ @content = file.read
147
+ end
148
+
149
+ it "should return the main page content" do
150
+ Readability::Document.new(Nokogiri::HTML(@content, nil, 'ISO-8859'),nil,nil).content.unpack("C*").pack("U*") .should == "<div><div>\n <p>\n COLABORA\303\207\303\203O PARA A FOLHA\n </p>\n <p>\n A Anvisa (Ag\303\252ncia Nacional de Vigil\303\242ncia Sanit\303\241ria) interditou o lote do ch\303\241 de erva doce da marca Dr. Oetker. A medida foi publicada no \"Di\303\241rio Oficial da Uni\303\243o\" na quarta-feira (26).\n </p>\n <p>\n Segundo a Vigil\303\242ncia Sanit\303\241ria, o lote L160T02 do produto --data de validade 01/12/2011-- apresentou resultado insatisfat\303\263rio no ensaio de pesquisa para mat\303\251rias macrosc\303\263picas e microsc\303\263picas que detectou a presen\303\247a de p\303\252lo de roedor e fragmentos de inseto.\n </p>\n <p>\n A interdi\303\247\303\243o cautelar vale pelo per\303\255odo de 90 dias ap\303\263s a data de publica\303\247\303\243o. Durante esse tempo, o produto interditado n\303\243o deve ser consumido e nem comercializado. As pessoas que j\303\241 adquiriram o produto do lote suspenso devem interromper o consumo.\n </p>\n</div></div>"
151
+ end
152
+ end
153
+
154
+ describe 'dealing with utf-8' do
155
+ before do
156
+ @doc = Readability::Document.new(Nokogiri::HTML("<html><head><title>title!</title></head><body><div><p>Açougue, espátula, Vovô, çáóéãà</p></div></body>", nil, 'UTF-8'), nil, nil, :min_text_length => 0, :retry_length => 1)
157
+ end
158
+
159
+ it 'should return the main page content' do
160
+ @doc.content.should match("Açougue, espátula, Vovô, çáóéãà")
161
+ end
162
+ end
163
+
143
164
  describe "ignoring sidebars" do
144
165
  before do
145
166
  @doc = Readability::Document.new(Nokogiri::HTML("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>"), nil, nil, :min_text_length => 0, :retry_length => 1)
@@ -149,13 +170,13 @@ describe Readability do
149
170
  @doc.content.should_not match("sidebar")
150
171
  end
151
172
  end
152
-
173
+
153
174
  describe "outputs good stuff for known documents" do
154
175
  before do
155
176
  @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
156
177
  @samples = @html_files.map {|filename| File.basename(filename, '.html') }
157
178
  end
158
-
179
+
159
180
  it "should output expected fragments of text" do
160
181
 
161
182
  checks = 0
@@ -165,12 +186,12 @@ describe Readability do
165
186
 
166
187
  load "fixtures/samples/#{sample}-fragments.rb"
167
188
  puts "testing #{sample}..."
168
-
189
+
169
190
  $required_fragments.each do |required_text|
170
191
  doc.should include(required_text)
171
192
  checks += 1
172
193
  end
173
-
194
+
174
195
  $excluded_fragments.each do |text_to_avoid|
175
196
  doc.should_not include(text_to_avoid)
176
197
  checks += 1
@@ -179,21 +200,20 @@ describe Readability do
179
200
  puts "Performed #{checks} checks."
180
201
  end
181
202
  end
182
-
203
+
183
204
  describe "handles vimeo.com videos" do
184
-
185
- before(:each) do
205
+
206
+ before(:each) do
186
207
  FakeWeb.register_uri(:get, 'http://vimeo.com/10365005',
187
208
  :response => File.read("spec/fixtures/vimeo.com.html"))
188
209
  @uri = URI.parse("http://vimeo.com/10365005")
189
-
190
- @content = Readability::Document.new(Nokogiri::HTML(open('http://vimeo.com/10365005')), @uri, @uri).content
210
+ @content = Readability::Document.new(Nokogiri::HTML(open('http://vimeo.com/10365005')), @uri.host, @uri.request_uri).content
191
211
  end
192
-
212
+
193
213
  it "should extract the video from the page" do
194
- @content.should include("<iframe src=\"http://player.vimeo.com/video/10365005")
214
+ @content.should include("<iframe src=\"http://player.vimeo.com/video/10365005\"")
195
215
  end
196
-
216
+
197
217
  end
198
-
218
+
199
219
  end
data/spec/spec_helper.rb CHANGED
@@ -1,13 +1,8 @@
1
- $LOAD_PATH.unshift(File.dirname(__FILE__))
2
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
1
  require 'rubygems'
4
2
  require 'readability'
5
- require 'spec'
6
- require 'spec/autorun'
7
- require 'nokogiri'
8
- require 'open-uri'
9
- require 'fakeweb'
3
+ require 'bundler'
10
4
 
11
- Spec::Runner.configure do |config|
5
+ Bundler.setup(:test)
12
6
 
13
- end
7
+ require 'open-uri'
8
+ require 'fakeweb'
data/test_on_url.rb ADDED
@@ -0,0 +1,6 @@
1
+ require 'rubygems'
2
+ require 'open-uri'
3
+ require 'lib/readability'
4
+
5
+ text = open(ARGV.first).read
6
+ p Readability::Document.new(text).content
metadata CHANGED
@@ -1,26 +1,27 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: busk-ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
+ - 1
8
9
  - 0
9
- - 7
10
- version: 1.0.7
10
+ version: 1.1.0
11
11
  platform: ruby
12
- authors: []
13
-
12
+ authors:
13
+ - Fabio Mont Alegre
14
+ - Rodrigo Flores
14
15
  autorequire:
15
16
  bindir: bin
16
17
  cert_chain: []
17
18
 
18
- date: 2011-01-05 00:00:00 -02:00
19
+ date: 2011-01-27 00:00:00 -02:00
19
20
  default_executable:
20
21
  dependencies: []
21
22
 
22
23
  description:
23
- email: spiceee@gmail.com
24
+ email: it-team@busk.com
24
25
  executables: []
25
26
 
26
27
  extensions: []
@@ -28,8 +29,14 @@ extensions: []
28
29
  extra_rdoc_files: []
29
30
 
30
31
  files:
32
+ - Gemfile
33
+ - Gemfile.lock
34
+ - README
35
+ - Rakefile
31
36
  - lib/readability.rb
37
+ - ruby-readability.gemspec
32
38
  - spec/fixtures/cant_read.html
39
+ - spec/fixtures/folha.html
33
40
  - spec/fixtures/sample.html
34
41
  - spec/fixtures/samples/channel4-1-fragments.rb
35
42
  - spec/fixtures/samples/channel4-1.html
@@ -40,6 +47,7 @@ files:
40
47
  - spec/readability_spec.rb
41
48
  - spec/spec.opts
42
49
  - spec/spec_helper.rb
50
+ - test_on_url.rb
43
51
  has_rdoc: true
44
52
  homepage: http://github.com/busk/ruby-readability
45
53
  licenses: []
@@ -49,8 +57,6 @@ rdoc_options: []
49
57
 
50
58
  require_paths:
51
59
  - lib
52
- - spec
53
- - spec/fixtures
54
60
  required_ruby_version: !ruby/object:Gem::Requirement
55
61
  none: false
56
62
  requirements:
@@ -76,5 +82,16 @@ rubygems_version: 1.3.7
76
82
  signing_key:
77
83
  specification_version: 3
78
84
  summary: A rewrite of original ruby-readability
79
- test_files: []
80
-
85
+ test_files:
86
+ - spec/fixtures/cant_read.html
87
+ - spec/fixtures/folha.html
88
+ - spec/fixtures/sample.html
89
+ - spec/fixtures/samples/channel4-1-fragments.rb
90
+ - spec/fixtures/samples/channel4-1.html
91
+ - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
92
+ - spec/fixtures/samples/globemail-ottawa-cuts.html
93
+ - spec/fixtures/should_not_truncate.txt
94
+ - spec/fixtures/vimeo.com.html
95
+ - spec/readability_spec.rb
96
+ - spec/spec.opts
97
+ - spec/spec_helper.rb