ruby-readability 0.5.0 → 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -20,6 +20,98 @@ describe Readability do
20
20
  HTML
21
21
  end
22
22
 
23
+ describe "images" do
24
+ before do
25
+ @bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
26
+ @nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
27
+ @thesum = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
28
+
29
+ FakeWeb::Registry.instance.clean_registry
30
+ FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg",
31
+ :body => File.read(File.dirname(__FILE__) + "/fixtures/images/dim_1416768a.jpg"))
32
+ end
33
+
34
+ it "should show one image, but outside of the best candidate" do
35
+ @doc = Readability::Document.new(@thesum)
36
+ @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
37
+ @doc.best_candidate_has_image.should == false
38
+ end
39
+
40
+ it "should show one image inside of the best candidate" do
41
+ @doc = Readability::Document.new(@nytimes)
42
+ @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
43
+ @doc.best_candidate_has_image.should == true
44
+ end
45
+
46
+ it "should not try to download local images" do
47
+ @doc = Readability::Document.new(<<-HTML)
48
+ <html>
49
+ <head>
50
+ <title>title!</title>
51
+ </head>
52
+ <body class='comment'>
53
+ <div>
54
+ <img src="/something/local.gif" />
55
+ </div>
56
+ </body>
57
+ </html>
58
+ HTML
59
+ do_not_allow(@doc).load_image(anything)
60
+ @doc.images.should == []
61
+ end
62
+
63
+ describe "no images" do
64
+ it "shouldn't show images" do
65
+ @doc = Readability::Document.new(@bbc, :min_image_height => 600)
66
+ @doc.images.should == []
67
+ @doc.best_candidate_has_image.should == false
68
+ end
69
+ end
70
+
71
+ describe "poll of images" do
72
+ it "should show some images inside of the best candidate" do
73
+ @doc = Readability::Document.new(@bbc)
74
+ @doc.images.should =~ ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
75
+ "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg",
76
+ "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
77
+ "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
78
+ @doc.best_candidate_has_image.should == true
79
+ end
80
+
81
+ it "should show some images inside of the best candidate, include gif format" do
82
+ @doc = Readability::Document.new(@bbc, :ignore_image_format => [])
83
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
84
+ @doc.best_candidate_has_image.should == true
85
+ end
86
+
87
+ describe "width, height and format" do
88
+ it "should show some images inside of the best candidate, but with width most equal to 400px" do
89
+ @doc = Readability::Document.new(@bbc, :min_image_width => 400, :ignore_image_format => [])
90
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"]
91
+ @doc.best_candidate_has_image.should == true
92
+ end
93
+
94
+ it "should show some images inside of the best candidate, but with width most equal to 304px" do
95
+ @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => [])
96
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
97
+ @doc.best_candidate_has_image.should == true
98
+ end
99
+
100
+ it "should show some images inside of the best candidate, but with width most equal to 304px and ignoring JPG format" do
101
+ @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => ["jpg"])
102
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
103
+ @doc.best_candidate_has_image.should == true
104
+ end
105
+
106
+ it "should show some images inside of the best candidate, but with height most equal to 400px, no ignoring no format" do
107
+ @doc = Readability::Document.new(@bbc, :min_image_height => 400, :ignore_image_format => [])
108
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
109
+ @doc.best_candidate_has_image.should == true
110
+ end
111
+ end
112
+ end
113
+ end
114
+
23
115
  describe "transformMisusedDivsIntoParagraphs" do
24
116
  before do
25
117
  @doc = Readability::Document.new(@simple_html_fixture)
@@ -207,7 +299,7 @@ describe Readability do
207
299
  @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
208
300
  @samples = @html_files.map {|filename| File.basename(filename, '.html') }
209
301
  end
210
-
302
+
211
303
  it "should output expected fragments of text" do
212
304
  checks = 0
213
305
  @samples.each do |sample|
@@ -216,12 +308,12 @@ describe Readability do
216
308
 
217
309
  load "fixtures/samples/#{sample}-fragments.rb"
218
310
  #puts "testing #{sample}..."
219
-
311
+
220
312
  $required_fragments.each do |required_text|
221
313
  doc.should include(required_text)
222
314
  checks += 1
223
315
  end
224
-
316
+
225
317
  $excluded_fragments.each do |text_to_avoid|
226
318
  doc.should_not include(text_to_avoid)
227
319
  checks += 1
@@ -1,7 +1,11 @@
1
1
  require 'rubygems'
2
2
  require 'readability'
3
3
  require 'rr'
4
+ require 'fakeweb'
5
+
6
+ FakeWeb.allow_net_connect = false
4
7
 
5
8
  RSpec.configure do |config|
6
9
  config.mock_with :rr
7
10
  end
11
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -12,22 +12,33 @@ authors:
12
12
  autorequire:
13
13
  bindir: bin
14
14
  cert_chain: []
15
- date: 2012-01-24 00:00:00.000000000Z
15
+ date: 2012-03-14 00:00:00.000000000Z
16
16
  dependencies:
17
17
  - !ruby/object:Gem::Dependency
18
18
  name: rspec
19
- requirement: &70232951278200 !ruby/object:Gem::Requirement
19
+ requirement: &86479890 !ruby/object:Gem::Requirement
20
20
  none: false
21
21
  requirements:
22
22
  - - ! '>='
23
23
  - !ruby/object:Gem::Version
24
- version: '2.6'
24
+ version: '2.8'
25
25
  type: :development
26
26
  prerelease: false
27
- version_requirements: *70232951278200
27
+ version_requirements: *86479890
28
+ - !ruby/object:Gem::Dependency
29
+ name: rspec-expectations
30
+ requirement: &86479650 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ! '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '2.8'
36
+ type: :development
37
+ prerelease: false
38
+ version_requirements: *86479650
28
39
  - !ruby/object:Gem::Dependency
29
40
  name: rr
30
- requirement: &70232951277680 !ruby/object:Gem::Requirement
41
+ requirement: &86479420 !ruby/object:Gem::Requirement
31
42
  none: false
32
43
  requirements:
33
44
  - - ! '>='
@@ -35,10 +46,10 @@ dependencies:
35
46
  version: '1.0'
36
47
  type: :development
37
48
  prerelease: false
38
- version_requirements: *70232951277680
49
+ version_requirements: *86479420
39
50
  - !ruby/object:Gem::Dependency
40
51
  name: nokogiri
41
- requirement: &70232951277200 !ruby/object:Gem::Requirement
52
+ requirement: &86479190 !ruby/object:Gem::Requirement
42
53
  none: false
43
54
  requirements:
44
55
  - - ! '>='
@@ -46,10 +57,10 @@ dependencies:
46
57
  version: 1.4.2
47
58
  type: :runtime
48
59
  prerelease: false
49
- version_requirements: *70232951277200
60
+ version_requirements: *86479190
50
61
  - !ruby/object:Gem::Dependency
51
62
  name: guess_html_encoding
52
- requirement: &70232951276720 !ruby/object:Gem::Requirement
63
+ requirement: &86478960 !ruby/object:Gem::Requirement
53
64
  none: false
54
65
  requirements:
55
66
  - - ! '>='
@@ -57,7 +68,7 @@ dependencies:
57
68
  version: 0.0.2
58
69
  type: :runtime
59
70
  prerelease: false
60
- version_requirements: *70232951276720
71
+ version_requirements: *86478960
61
72
  description: Port of arc90's readability project to ruby
62
73
  email:
63
74
  - andrew@iterationlabs.com
@@ -68,14 +79,18 @@ extra_rdoc_files: []
68
79
  files:
69
80
  - .document
70
81
  - .gitignore
82
+ - .rspec
71
83
  - Gemfile
72
- - README
84
+ - README.markdown
73
85
  - Rakefile
74
86
  - bin/readability
75
87
  - lib/readability.rb
76
88
  - lib/ruby-readability.rb
77
89
  - ruby-readability.gemspec
90
+ - spec/fixtures/bbc.html
78
91
  - spec/fixtures/cant_read.html
92
+ - spec/fixtures/images/dim_1416768a.jpg
93
+ - spec/fixtures/nytimes.html
79
94
  - spec/fixtures/sample.html
80
95
  - spec/fixtures/samples/blogpost_with_links-fragments.rb
81
96
  - spec/fixtures/samples/blogpost_with_links.html
@@ -86,6 +101,7 @@ files:
86
101
  - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
87
102
  - spec/fixtures/samples/globemail-ottawa-cuts.html
88
103
  - spec/fixtures/should_not_truncate.txt
104
+ - spec/fixtures/thesun.html
89
105
  - spec/readability_spec.rb
90
106
  - spec/spec.opts
91
107
  - spec/spec_helper.rb
@@ -109,22 +125,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
109
125
  version: '0'
110
126
  requirements: []
111
127
  rubyforge_project: ruby-readability
112
- rubygems_version: 1.8.10
128
+ rubygems_version: 1.8.16
113
129
  signing_key:
114
130
  specification_version: 3
115
131
  summary: Port of arc90's readability project to ruby
116
- test_files:
117
- - spec/fixtures/cant_read.html
118
- - spec/fixtures/sample.html
119
- - spec/fixtures/samples/blogpost_with_links-fragments.rb
120
- - spec/fixtures/samples/blogpost_with_links.html
121
- - spec/fixtures/samples/channel4-1-fragments.rb
122
- - spec/fixtures/samples/channel4-1.html
123
- - spec/fixtures/samples/foxnews-india1-fragments.rb
124
- - spec/fixtures/samples/foxnews-india1.html
125
- - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
126
- - spec/fixtures/samples/globemail-ottawa-cuts.html
127
- - spec/fixtures/should_not_truncate.txt
128
- - spec/readability_spec.rb
129
- - spec/spec.opts
130
- - spec/spec_helper.rb
132
+ test_files: []
data/README DELETED
@@ -1,54 +0,0 @@
1
- Ruby Readability
2
-
3
- Command line:
4
- (sudo) gem install ruby-readability
5
-
6
- Bundler:
7
- gem "ruby-readability", :require => 'readability'
8
-
9
- Example:
10
-
11
- require 'rubygems'
12
- require 'readability'
13
- require 'open-uri'
14
-
15
- source = open('http://lab.arc90.com/experiments/readability/').read
16
- puts Readability::Document.new(source).content
17
-
18
- Options:
19
-
20
- You may provide additional options to Readability::Document.new, including:
21
-
22
- :tags - the base whitelist of tags to sanitize, defaults to %w[div p]
23
- :remove_empty_nodes - remove <p> tags that have no text content; also removes p tags that contain only images
24
- :attributes - whitelist of allowed attributes
25
- :debug - provide debugging output, defaults false
26
- :encoding - if this page is of a known encoding, you can specify it; if left
27
- unspecified, the encoding will be guessed (only in Ruby 1.9.x)
28
- :html_headers - in Ruby 1.9.x these will be passed to the guess_html_encoding gem
29
- to aid with guessing the HTML encoding
30
-
31
- Readability comes with a command-line tool for experimentation in bin/readability.
32
-
33
- Usage: readability [options] URL
34
- -d, --debug Show debug output
35
- -i, --images Keep images and links
36
- -h, --help Show this message
37
-
38
- Potential issues:
39
-
40
- * If you're on a Mac and are getting segmentation faults, see this discussion https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2.
41
- Version 2.7.8 of libxml2 with the following worked for me:
42
- gem install nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
43
-
44
- ===
45
-
46
- This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
47
-
48
- This is a ruby port of arc90's readability project
49
-
50
- http://lab.arc90.com/experiments/readability/
51
-
52
- Given an HTML document, it pulls out the main body text and cleans it up.
53
-
54
- Ruby port by starrhorne, libc, and iterationlabs. Original gemification by fizx.