busk-ruby-readability 1.0.6 → 1.0.7

Sign up to get free protection for your applications and to get access to all the features.
data/spec/readability_spec.rb ADDED
@@ -0,0 +1,199 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), "spec_helper"))
2
+
3
+ describe Readability do
4
+ before do
5
+ @simple_html_fixture = Nokogiri::HTML <<-HTML
6
+ <html>
7
+ <head>
8
+ <title>title!</title>
9
+ </head>
10
+ <body class='comment'>
11
+ <div>
12
+ <p class='comment'>a comment</p>
13
+ <div class='comment' id='body'>real content</div>
14
+ <div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
15
+ </div>
16
+ </body>
17
+ </html>
18
+ HTML
19
+ end
20
+
21
+ describe "transformMisusedDivsIntoParagraphs" do
22
+ before do
23
+ @doc = Readability::Document.new(@simple_html_fixture, nil, nil)
24
+ @doc.transform_misused_divs_into_paragraphs!
25
+ end
26
+
27
+ it "should transform divs containing no block elements into <p>s" do
28
+ @doc.document.css("#body").first.name.should == "p"
29
+ end
30
+
31
+ it "should not transform divs that contain block elements" do
32
+ @doc.document.css("#contains_blockquote").first.name.should == "div"
33
+ end
34
+ end
35
+
36
+ describe "score_node" do
37
+ before do
38
+ @html = Nokogiri::HTML <<-HTML
39
+ <html>
40
+ <body>
41
+ <div id='elem1'>
42
+ <p>some content</p>
43
+ </div>
44
+ <th id='elem2'>
45
+ <p>some other content</p>
46
+ </th>
47
+ </body>
48
+ </html>
49
+ HTML
50
+
51
+ @doc = Readability::Document.new(@html, nil, nil)
52
+ @elem1 = @doc.document.css("#elem1").first
53
+ @elem2 = @doc.document.css("#elem2").first
54
+ end
55
+
56
+ it "should like <div>s more than <th>s" do
57
+ @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
58
+ end
59
+
60
+ it "should like classes like text more than classes like comment" do
61
+ @elem2.name = "div"
62
+ @doc.score_node(@elem1)[:content_score].should == @doc.score_node(@elem2)[:content_score]
63
+ @elem1['class'] = "text"
64
+ @elem2['class'] = "comment"
65
+ @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
66
+ end
67
+ end
68
+
69
+ describe "remove_unlikely_candidates!" do
70
+ before do
71
+ @doc = Readability::Document.new(@simple_html_fixture, nil, nil)
72
+ @doc.remove_unlikely_candidates!
73
+ end
74
+
75
+ it "should remove things that have class comment" do
76
+ @doc.document.inner_html.should_not =~ /a comment/
77
+ end
78
+
79
+ it "should not remove body tags" do
80
+ @doc.document.inner_html.should =~ /<\/body>/
81
+ end
82
+
83
+ it "should not remove things with class comment and id body" do
84
+ @doc.document.inner_html.should =~ /real content/
85
+ end
86
+ end
87
+
88
+ describe "score_paragraphs" do
89
+ before(:each) do
90
+ @html = Nokogiri::HTML <<-HTML
91
+ <html>
92
+ <head>
93
+ <title>title!</title>
94
+ </head>
95
+ <body id="body">
96
+ <div id="div1">
97
+ <div id="div2">
98
+ <p id="some_comment">a comment</p>
99
+ </div>
100
+ <p id="some_text">some text</p>
101
+ </div>
102
+ <div id="div3">
103
+ <p id="some_text2">some more text</p>
104
+ </div>
105
+ </body>
106
+ </html>
107
+ HTML
108
+
109
+ @doc = Readability::Document.new(@html, nil, nil)
110
+ @candidates = @doc.score_paragraphs(0)
111
+ end
112
+
113
+ it "should score elements in the document" do
114
+ @candidates.values.length.should == 4
115
+ end
116
+
117
+ it "should prefer the body in this particular example" do
118
+ @candidates.values.sort { |a, b|
119
+ b[:content_score] <=> a[:content_score]
120
+ }.first[:elem][:id].should == "body"
121
+ end
122
+ end
123
+
124
+ describe "the cant_read.html fixture" do
125
+ it "should work on the cant_read.html fixture with some allowed tags" do
126
+ allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
127
+ allowed_attributes = %w[href]
128
+ html = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html")
129
+ Readability::Document.new(Nokogiri::HTML(html), nil, nil, :tags => allowed_tags, :attributes => allowed_attributes).content.should match(/Can you talk a little about how you developed the looks for the/)
130
+ end
131
+ end
132
+
133
+ describe "general functionality" do
134
+ before do
135
+ @doc = Readability::Document.new(Nokogiri::HTML("<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>"), nil, nil, :min_text_length => 0, :retry_length => 1)
136
+ end
137
+
138
+ it "should return the main page content" do
139
+ @doc.content.should match("Some content")
140
+ end
141
+ end
142
+
143
+ describe "ignoring sidebars" do
144
+ before do
145
+ @doc = Readability::Document.new(Nokogiri::HTML("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>"), nil, nil, :min_text_length => 0, :retry_length => 1)
146
+ end
147
+
148
+ it "should not return the sidebar" do
149
+ @doc.content.should_not match("sidebar")
150
+ end
151
+ end
152
+
153
+ describe "outputs good stuff for known documents" do
154
+ before do
155
+ @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
156
+ @samples = @html_files.map {|filename| File.basename(filename, '.html') }
157
+ end
158
+
159
+ it "should output expected fragments of text" do
160
+
161
+ checks = 0
162
+ @samples.each do |sample|
163
+ html = File.read(File.dirname(__FILE__) + "/fixtures/samples/#{sample}.html")
164
+ doc = Readability::Document.new(Nokogiri::HTML(html), nil, nil).content
165
+
166
+ load "fixtures/samples/#{sample}-fragments.rb"
167
+ puts "testing #{sample}..."
168
+
169
+ $required_fragments.each do |required_text|
170
+ doc.should include(required_text)
171
+ checks += 1
172
+ end
173
+
174
+ $excluded_fragments.each do |text_to_avoid|
175
+ doc.should_not include(text_to_avoid)
176
+ checks += 1
177
+ end
178
+ end
179
+ puts "Performed #{checks} checks."
180
+ end
181
+ end
182
+
183
+ describe "handles vimeo.com videos" do
184
+
185
+ before(:each) do
186
+ FakeWeb.register_uri(:get, 'http://vimeo.com/10365005',
187
+ :response => File.read("spec/fixtures/vimeo.com.html"))
188
+ @uri = URI.parse("http://vimeo.com/10365005")
189
+
190
+ @content = Readability::Document.new(Nokogiri::HTML(open('http://vimeo.com/10365005')), @uri, @uri).content
191
+ end
192
+
193
+ it "should extract the video from the page" do
194
+ @content.should include("<iframe src=\"http://player.vimeo.com/video/10365005")
195
+ end
196
+
197
+ end
198
+
199
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1,4 @@
1
+ --colour
2
+ --format s -c
3
+ --loadby mtime
4
+ --reverse
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,13 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'rubygems'
4
+ require 'readability'
5
+ require 'spec'
6
+ require 'spec/autorun'
7
+ require 'nokogiri'
8
+ require 'open-uri'
9
+ require 'fakeweb'
10
+
11
+ Spec::Runner.configure do |config|
12
+
13
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: busk-ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 25
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
8
  - 0
9
- - 6
10
- version: 1.0.6
9
+ - 7
10
+ version: 1.0.7
11
11
  platform: ruby
12
12
  authors: []
13
13
 
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-10-05 00:00:00 -03:00
18
+ date: 2011-01-05 00:00:00 -02:00
19
19
  default_executable:
20
20
  dependencies: []
21
21
 
@@ -29,6 +29,17 @@ extra_rdoc_files: []
29
29
 
30
30
  files:
31
31
  - lib/readability.rb
32
+ - spec/fixtures/cant_read.html
33
+ - spec/fixtures/sample.html
34
+ - spec/fixtures/samples/channel4-1-fragments.rb
35
+ - spec/fixtures/samples/channel4-1.html
36
+ - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
37
+ - spec/fixtures/samples/globemail-ottawa-cuts.html
38
+ - spec/fixtures/should_not_truncate.txt
39
+ - spec/fixtures/vimeo.com.html
40
+ - spec/readability_spec.rb
41
+ - spec/spec.opts
42
+ - spec/spec_helper.rb
32
43
  has_rdoc: true
33
44
  homepage: http://github.com/busk/ruby-readability
34
45
  licenses: []
@@ -38,6 +49,8 @@ rdoc_options: []
38
49
 
39
50
  require_paths:
40
51
  - lib
52
+ - spec
53
+ - spec/fixtures
41
54
  required_ruby_version: !ruby/object:Gem::Requirement
42
55
  none: false
43
56
  requirements: