busk-ruby-readability 1.0.7 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
+ source "http://rubygems.org"
2
+
3
+ gem "nokogiri"
4
+
5
+ group :test do
6
+ gem 'rspec'
7
+ gem 'fakeweb'
8
+ end
9
+
10
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,28 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ busk-ruby-readability (1.1.0)
5
+
6
+ GEM
7
+ remote: http://rubygems.org/
8
+ specs:
9
+ diff-lcs (1.1.2)
10
+ fakeweb (1.3.0)
11
+ nokogiri (1.4.4)
12
+ rspec (2.4.0)
13
+ rspec-core (~> 2.4.0)
14
+ rspec-expectations (~> 2.4.0)
15
+ rspec-mocks (~> 2.4.0)
16
+ rspec-core (2.4.0)
17
+ rspec-expectations (2.4.0)
18
+ diff-lcs (~> 1.1.2)
19
+ rspec-mocks (2.4.0)
20
+
21
+ PLATFORMS
22
+ ruby
23
+
24
+ DEPENDENCIES
25
+ busk-ruby-readability!
26
+ fakeweb
27
+ nokogiri
28
+ rspec
data/README ADDED
@@ -0,0 +1,9 @@
1
+ This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
2
+
3
+ This is a ruby port of arc90's readability project
4
+
5
+ http://lab.arc90.com/experiments/readability/
6
+
7
+ Given a html document, it pulls out the main body text and cleans it up.
8
+
9
+ Ruby port by starrhorne and iterationlabs
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
data/lib/readability.rb CHANGED
@@ -43,6 +43,7 @@ module Readability
43
43
  end
44
44
 
45
45
  def content(remove_unlikely_candidates = true)
46
+ debug "Starting the content heuristic"
46
47
  @document.css("script, style").each {|el| el.remove }
47
48
  @document.search('//comment()').each {|el| el.remove }
48
49
 
@@ -58,9 +59,9 @@ module Readability
58
59
  candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
59
60
  best_candidate = select_best_candidate(candidates)
60
61
  article = get_article(candidates, best_candidate)
61
-
62
+
62
63
  cleaned_article = sanitize(article, candidates, options)
63
-
64
+
64
65
  if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
65
66
  make_html
66
67
  content(false)
@@ -69,24 +70,25 @@ module Readability
69
70
  end
70
71
  end
71
72
  end
72
-
73
+
73
74
  def is_youtube?
74
- (@base_uri.to_s =~ /^http:\/\/(www\.)?youtube.com/)
75
+ (@base_uri.to_s =~ /^(www\.)?youtube.com/)
75
76
  end
76
-
77
+
77
78
  def is_vimeo?
78
- (@base_uri.to_s =~ /^http:\/\/(www.)?vimeo.com/)
79
+ (@base_uri.to_s =~ /^(www.)?vimeo.com/)
79
80
  end
80
81
 
81
82
  def is_ted?
82
- (@base_uri.to_s =~ /^http:\/\/(www.)?ted.com\/talks/)
83
+ (@base_uri.to_s =~ /^(www.)?ted.com\/talks/)
83
84
  end
84
-
85
+
85
86
  def is_special_case?
86
87
  (@base_uri.to_s =~ REGEXES[:videoRe])
87
88
  end
88
-
89
+
89
90
  def youtube
91
+ debug("I have a Youtube video page")
90
92
  if @request =~ /\?v=([_\-a-z0-9]+)&?/i
91
93
  Nokogiri::HTML.fragment <<-HTML
92
94
  <object width="706" height="422">
@@ -100,8 +102,9 @@ module Readability
100
102
  nil
101
103
  end
102
104
  end
103
-
105
+
104
106
  def vimeo
107
+ debug("I have a Vimeo video page")
105
108
  # matches non-channel or pages that used swfobject to print player
106
109
  if @document.css("#clip_id")
107
110
  Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{@document.css("#clip_id").attr('value')}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
@@ -118,8 +121,9 @@ module Readability
118
121
  nil
119
122
  end
120
123
  end
121
-
124
+
122
125
  def ted
126
+ debug("I have a TED video page")
123
127
  if (player = @document.css(".copy_paste")).present?
124
128
  unless player.first.attr("value").blank?
125
129
  Nokogiri::HTML.fragment(player.first.attr("value").to_s)
@@ -130,7 +134,7 @@ module Readability
130
134
  nil
131
135
  end
132
136
  end
133
-
137
+
134
138
  def get_article(candidates, best_candidate)
135
139
  # Now that we have the top candidate, look through its siblings for content that might also be related.
136
140
  # Things like preambles, content split by ads that we removed, etc.
@@ -164,7 +168,7 @@ module Readability
164
168
  end
165
169
 
166
170
  def select_best_candidate(candidates)
167
- @best_candidate ||= begin
171
+ @best_candidate ||= begin
168
172
  sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
169
173
 
170
174
  debug("Top 5 candidates:")
@@ -239,6 +243,10 @@ module Readability
239
243
  weight
240
244
  end
241
245
 
246
+ def convert_to_utf8(string)
247
+ string.unpack("C*").pack("U*")
248
+ end
249
+
242
250
  def score_node(elem)
243
251
  content_score = class_weight(elem)
244
252
  case elem.name.downcase
@@ -255,7 +263,7 @@ module Readability
255
263
  end
256
264
 
257
265
  def debug(str)
258
- puts str if options[:debug]
266
+ puts "READABILITY : "+ str if options[:debug]
259
267
  end
260
268
 
261
269
  def remove_unlikely_candidates!
@@ -296,7 +304,7 @@ module Readability
296
304
  node.css("form").each do |elem|
297
305
  elem.remove
298
306
  end
299
-
307
+
300
308
  node.css("iframe").each do |iframe|
301
309
  unless iframe.attr("src").to_s =~ REGEXES[:videoRe]
302
310
  iframe.remove
@@ -326,7 +334,7 @@ module Readability
326
334
  to_remove = false
327
335
  reason = ""
328
336
 
329
- if (counts["img"] > counts["p"]) && (counts["p"] > 0)
337
+ if (counts["img"] > counts["p"]) && (counts["p"] > 0)
330
338
  reason = "too many images #{counts['p']}"
331
339
  to_remove = true
332
340
  elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
@@ -0,0 +1,13 @@
1
+ Gem::Specification.new do |s|
2
+ s.authors = ["Fabio Mont Alegre", "Rodrigo Flores"]
3
+ s.email = "it-team@busk.com"
4
+ s.homepage = "http://github.com/busk/ruby-readability"
5
+ s.version = "1.1.0"
6
+ s.name = "busk-ruby-readability"
7
+ s.summary = "A rewrite of original ruby-readability"
8
+
9
+ s.files = `git ls-files`.split("\n")
10
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
11
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
12
+ s.require_paths = ["lib"]
13
+ end
@@ -0,0 +1,29 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <title>Folha.com - Equilíbrio e Saúde - Anvisa detecta pelo de roedor e interdita chá da Dr. Oetker - 27/01/2011</title>
5
+ <meta name="title" content="Anvisa detecta pelo de roedor e interdita chá da Dr. Oetker">
6
+ <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
7
+ </head>
8
+
9
+ <body>
10
+ <div id="articleBy">
11
+ <p>
12
+ COLABORAÇÃO PARA A <b>FOLHA</b>
13
+ </p>
14
+
15
+ <p>
16
+ A Anvisa (Agência Nacional de Vigilância Sanitária) interditou o lote do chá de erva doce da marca Dr. Oetker. A medida foi publicada no &quot;Diário Oficial da União&quot; na quarta-feira (26).
17
+ </p>
18
+
19
+ <p>
20
+ Segundo a Vigilância Sanitária, o lote L160T02 do produto --data de validade 01/12/2011-- apresentou resultado insatisfatório no ensaio de pesquisa para matérias macroscópicas e microscópicas que detectou a presença de pêlo de roedor e fragmentos de inseto.
21
+ </p>
22
+
23
+ <p>
24
+ A interdição cautelar vale pelo período de 90 dias após a data de publicação. Durante esse tempo, o produto interditado não deve ser consumido e nem comercializado. As pessoas que já adquiriram o produto do lote suspenso devem interromper o consumo.
25
+ </p>
26
+ </div>
27
+ </body>
28
+
29
+ </html>
@@ -47,7 +47,7 @@ describe Readability do
47
47
  </body>
48
48
  </html>
49
49
  HTML
50
-
50
+
51
51
  @doc = Readability::Document.new(@html, nil, nil)
52
52
  @elem1 = @doc.document.css("#elem1").first
53
53
  @elem2 = @doc.document.css("#elem2").first
@@ -105,7 +105,7 @@ describe Readability do
105
105
  </body>
106
106
  </html>
107
107
  HTML
108
-
108
+
109
109
  @doc = Readability::Document.new(@html, nil, nil)
110
110
  @candidates = @doc.score_paragraphs(0)
111
111
  end
@@ -140,6 +140,27 @@ describe Readability do
140
140
  end
141
141
  end
142
142
 
143
+ describe 'dealing with iso-8859-1' do
144
+ before(:each) do
145
+ file = File.open('spec/fixtures/folha.html', 'r')
146
+ @content = file.read
147
+ end
148
+
149
+ it "should return the main page content" do
150
+ Readability::Document.new(Nokogiri::HTML(@content, nil, 'ISO-8859'),nil,nil).content.unpack("C*").pack("U*") .should == "<div><div>\n <p>\n COLABORA\303\207\303\203O PARA A FOLHA\n </p>\n <p>\n A Anvisa (Ag\303\252ncia Nacional de Vigil\303\242ncia Sanit\303\241ria) interditou o lote do ch\303\241 de erva doce da marca Dr. Oetker. A medida foi publicada no \"Di\303\241rio Oficial da Uni\303\243o\" na quarta-feira (26).\n </p>\n <p>\n Segundo a Vigil\303\242ncia Sanit\303\241ria, o lote L160T02 do produto --data de validade 01/12/2011-- apresentou resultado insatisfat\303\263rio no ensaio de pesquisa para mat\303\251rias macrosc\303\263picas e microsc\303\263picas que detectou a presen\303\247a de p\303\252lo de roedor e fragmentos de inseto.\n </p>\n <p>\n A interdi\303\247\303\243o cautelar vale pelo per\303\255odo de 90 dias ap\303\263s a data de publica\303\247\303\243o. Durante esse tempo, o produto interditado n\303\243o deve ser consumido e nem comercializado. As pessoas que j\303\241 adquiriram o produto do lote suspenso devem interromper o consumo.\n </p>\n</div></div>"
151
+ end
152
+ end
153
+
154
+ describe 'dealing with utf-8' do
155
+ before do
156
+ @doc = Readability::Document.new(Nokogiri::HTML("<html><head><title>title!</title></head><body><div><p>Açougue, espátula, Vovô, çáóéãà</p></div></body>", nil, 'UTF-8'), nil, nil, :min_text_length => 0, :retry_length => 1)
157
+ end
158
+
159
+ it 'should return the main page content' do
160
+ @doc.content.should match("Açougue, espátula, Vovô, çáóéãà")
161
+ end
162
+ end
163
+
143
164
  describe "ignoring sidebars" do
144
165
  before do
145
166
  @doc = Readability::Document.new(Nokogiri::HTML("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>"), nil, nil, :min_text_length => 0, :retry_length => 1)
@@ -149,13 +170,13 @@ describe Readability do
149
170
  @doc.content.should_not match("sidebar")
150
171
  end
151
172
  end
152
-
173
+
153
174
  describe "outputs good stuff for known documents" do
154
175
  before do
155
176
  @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
156
177
  @samples = @html_files.map {|filename| File.basename(filename, '.html') }
157
178
  end
158
-
179
+
159
180
  it "should output expected fragments of text" do
160
181
 
161
182
  checks = 0
@@ -165,12 +186,12 @@ describe Readability do
165
186
 
166
187
  load "fixtures/samples/#{sample}-fragments.rb"
167
188
  puts "testing #{sample}..."
168
-
189
+
169
190
  $required_fragments.each do |required_text|
170
191
  doc.should include(required_text)
171
192
  checks += 1
172
193
  end
173
-
194
+
174
195
  $excluded_fragments.each do |text_to_avoid|
175
196
  doc.should_not include(text_to_avoid)
176
197
  checks += 1
@@ -179,21 +200,20 @@ describe Readability do
179
200
  puts "Performed #{checks} checks."
180
201
  end
181
202
  end
182
-
203
+
183
204
  describe "handles vimeo.com videos" do
184
-
185
- before(:each) do
205
+
206
+ before(:each) do
186
207
  FakeWeb.register_uri(:get, 'http://vimeo.com/10365005',
187
208
  :response => File.read("spec/fixtures/vimeo.com.html"))
188
209
  @uri = URI.parse("http://vimeo.com/10365005")
189
-
190
- @content = Readability::Document.new(Nokogiri::HTML(open('http://vimeo.com/10365005')), @uri, @uri).content
210
+ @content = Readability::Document.new(Nokogiri::HTML(open('http://vimeo.com/10365005')), @uri.host, @uri.request_uri).content
191
211
  end
192
-
212
+
193
213
  it "should extract the video from the page" do
194
- @content.should include("<iframe src=\"http://player.vimeo.com/video/10365005")
214
+ @content.should include("<iframe src=\"http://player.vimeo.com/video/10365005\"")
195
215
  end
196
-
216
+
197
217
  end
198
-
218
+
199
219
  end
data/spec/spec_helper.rb CHANGED
@@ -1,13 +1,8 @@
1
- $LOAD_PATH.unshift(File.dirname(__FILE__))
2
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
1
  require 'rubygems'
4
2
  require 'readability'
5
- require 'spec'
6
- require 'spec/autorun'
7
- require 'nokogiri'
8
- require 'open-uri'
9
- require 'fakeweb'
3
+ require 'bundler'
10
4
 
11
- Spec::Runner.configure do |config|
5
+ Bundler.setup(:test)
12
6
 
13
- end
7
+ require 'open-uri'
8
+ require 'fakeweb'
data/test_on_url.rb ADDED
@@ -0,0 +1,6 @@
1
+ require 'rubygems'
2
+ require 'open-uri'
3
+ require 'lib/readability'
4
+
5
+ text = open(ARGV.first).read
6
+ p Readability::Document.new(text).content
metadata CHANGED
@@ -1,26 +1,27 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: busk-ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
+ - 1
8
9
  - 0
9
- - 7
10
- version: 1.0.7
10
+ version: 1.1.0
11
11
  platform: ruby
12
- authors: []
13
-
12
+ authors:
13
+ - Fabio Mont Alegre
14
+ - Rodrigo Flores
14
15
  autorequire:
15
16
  bindir: bin
16
17
  cert_chain: []
17
18
 
18
- date: 2011-01-05 00:00:00 -02:00
19
+ date: 2011-01-27 00:00:00 -02:00
19
20
  default_executable:
20
21
  dependencies: []
21
22
 
22
23
  description:
23
- email: spiceee@gmail.com
24
+ email: it-team@busk.com
24
25
  executables: []
25
26
 
26
27
  extensions: []
@@ -28,8 +29,14 @@ extensions: []
28
29
  extra_rdoc_files: []
29
30
 
30
31
  files:
32
+ - Gemfile
33
+ - Gemfile.lock
34
+ - README
35
+ - Rakefile
31
36
  - lib/readability.rb
37
+ - ruby-readability.gemspec
32
38
  - spec/fixtures/cant_read.html
39
+ - spec/fixtures/folha.html
33
40
  - spec/fixtures/sample.html
34
41
  - spec/fixtures/samples/channel4-1-fragments.rb
35
42
  - spec/fixtures/samples/channel4-1.html
@@ -40,6 +47,7 @@ files:
40
47
  - spec/readability_spec.rb
41
48
  - spec/spec.opts
42
49
  - spec/spec_helper.rb
50
+ - test_on_url.rb
43
51
  has_rdoc: true
44
52
  homepage: http://github.com/busk/ruby-readability
45
53
  licenses: []
@@ -49,8 +57,6 @@ rdoc_options: []
49
57
 
50
58
  require_paths:
51
59
  - lib
52
- - spec
53
- - spec/fixtures
54
60
  required_ruby_version: !ruby/object:Gem::Requirement
55
61
  none: false
56
62
  requirements:
@@ -76,5 +82,16 @@ rubygems_version: 1.3.7
76
82
  signing_key:
77
83
  specification_version: 3
78
84
  summary: A rewrite of original ruby-readability
79
- test_files: []
80
-
85
+ test_files:
86
+ - spec/fixtures/cant_read.html
87
+ - spec/fixtures/folha.html
88
+ - spec/fixtures/sample.html
89
+ - spec/fixtures/samples/channel4-1-fragments.rb
90
+ - spec/fixtures/samples/channel4-1.html
91
+ - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
92
+ - spec/fixtures/samples/globemail-ottawa-cuts.html
93
+ - spec/fixtures/should_not_truncate.txt
94
+ - spec/fixtures/vimeo.com.html
95
+ - spec/readability_spec.rb
96
+ - spec/spec.opts
97
+ - spec/spec_helper.rb