busk-ruby-readability 1.0.6 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,199 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), "spec_helper"))
2
+
3
+ describe Readability do
4
+ before do
5
+ @simple_html_fixture = Nokogiri::HTML <<-HTML
6
+ <html>
7
+ <head>
8
+ <title>title!</title>
9
+ </head>
10
+ <body class='comment'>
11
+ <div>
12
+ <p class='comment'>a comment</p>
13
+ <div class='comment' id='body'>real content</div>
14
+ <div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
15
+ </div>
16
+ </body>
17
+ </html>
18
+ HTML
19
+ end
20
+
21
+ describe "transformMisusedDivsIntoParagraphs" do
22
+ before do
23
+ @doc = Readability::Document.new(@simple_html_fixture, nil, nil)
24
+ @doc.transform_misused_divs_into_paragraphs!
25
+ end
26
+
27
+ it "should transform divs containing no block elements into <p>s" do
28
+ @doc.document.css("#body").first.name.should == "p"
29
+ end
30
+
31
+ it "should not transform divs that contain block elements" do
32
+ @doc.document.css("#contains_blockquote").first.name.should == "div"
33
+ end
34
+ end
35
+
36
+ describe "score_node" do
37
+ before do
38
+ @html = Nokogiri::HTML <<-HTML
39
+ <html>
40
+ <body>
41
+ <div id='elem1'>
42
+ <p>some content</p>
43
+ </div>
44
+ <th id='elem2'>
45
+ <p>some other content</p>
46
+ </th>
47
+ </body>
48
+ </html>
49
+ HTML
50
+
51
+ @doc = Readability::Document.new(@html, nil, nil)
52
+ @elem1 = @doc.document.css("#elem1").first
53
+ @elem2 = @doc.document.css("#elem2").first
54
+ end
55
+
56
+ it "should like <div>s more than <th>s" do
57
+ @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
58
+ end
59
+
60
+ it "should like classes like text more than classes like comment" do
61
+ @elem2.name = "div"
62
+ @doc.score_node(@elem1)[:content_score].should == @doc.score_node(@elem2)[:content_score]
63
+ @elem1['class'] = "text"
64
+ @elem2['class'] = "comment"
65
+ @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
66
+ end
67
+ end
68
+
69
+ describe "remove_unlikely_candidates!" do
70
+ before do
71
+ @doc = Readability::Document.new(@simple_html_fixture, nil, nil)
72
+ @doc.remove_unlikely_candidates!
73
+ end
74
+
75
+ it "should remove things that have class comment" do
76
+ @doc.document.inner_html.should_not =~ /a comment/
77
+ end
78
+
79
+ it "should not remove body tags" do
80
+ @doc.document.inner_html.should =~ /<\/body>/
81
+ end
82
+
83
+ it "should not remove things with class comment and id body" do
84
+ @doc.document.inner_html.should =~ /real content/
85
+ end
86
+ end
87
+
88
+ describe "score_paragraphs" do
89
+ before(:each) do
90
+ @html = Nokogiri::HTML <<-HTML
91
+ <html>
92
+ <head>
93
+ <title>title!</title>
94
+ </head>
95
+ <body id="body">
96
+ <div id="div1">
97
+ <div id="div2">
98
+ <p id="some_comment">a comment</p>
99
+ </div>
100
+ <p id="some_text">some text</p>
101
+ </div>
102
+ <div id="div3">
103
+ <p id="some_text2">some more text</p>
104
+ </div>
105
+ </body>
106
+ </html>
107
+ HTML
108
+
109
+ @doc = Readability::Document.new(@html, nil, nil)
110
+ @candidates = @doc.score_paragraphs(0)
111
+ end
112
+
113
+ it "should score elements in the document" do
114
+ @candidates.values.length.should == 4
115
+ end
116
+
117
+ it "should prefer the body in this particular example" do
118
+ @candidates.values.sort { |a, b|
119
+ b[:content_score] <=> a[:content_score]
120
+ }.first[:elem][:id].should == "body"
121
+ end
122
+ end
123
+
124
+ describe "the cant_read.html fixture" do
125
+ it "should work on the cant_read.html fixture with some allowed tags" do
126
+ allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
127
+ allowed_attributes = %w[href]
128
+ html = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html")
129
+ Readability::Document.new(Nokogiri::HTML(html), nil, nil, :tags => allowed_tags, :attributes => allowed_attributes).content.should match(/Can you talk a little about how you developed the looks for the/)
130
+ end
131
+ end
132
+
133
+ describe "general functionality" do
134
+ before do
135
+ @doc = Readability::Document.new(Nokogiri::HTML("<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>"), nil, nil, :min_text_length => 0, :retry_length => 1)
136
+ end
137
+
138
+ it "should return the main page content" do
139
+ @doc.content.should match("Some content")
140
+ end
141
+ end
142
+
143
+ describe "ignoring sidebars" do
144
+ before do
145
+ @doc = Readability::Document.new(Nokogiri::HTML("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>"), nil, nil, :min_text_length => 0, :retry_length => 1)
146
+ end
147
+
148
+ it "should not return the sidebar" do
149
+ @doc.content.should_not match("sidebar")
150
+ end
151
+ end
152
+
153
+ describe "outputs good stuff for known documents" do
154
+ before do
155
+ @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
156
+ @samples = @html_files.map {|filename| File.basename(filename, '.html') }
157
+ end
158
+
159
+ it "should output expected fragments of text" do
160
+
161
+ checks = 0
162
+ @samples.each do |sample|
163
+ html = File.read(File.dirname(__FILE__) + "/fixtures/samples/#{sample}.html")
164
+ doc = Readability::Document.new(Nokogiri::HTML(html), nil, nil).content
165
+
166
+ load "fixtures/samples/#{sample}-fragments.rb"
167
+ puts "testing #{sample}..."
168
+
169
+ $required_fragments.each do |required_text|
170
+ doc.should include(required_text)
171
+ checks += 1
172
+ end
173
+
174
+ $excluded_fragments.each do |text_to_avoid|
175
+ doc.should_not include(text_to_avoid)
176
+ checks += 1
177
+ end
178
+ end
179
+ puts "Performed #{checks} checks."
180
+ end
181
+ end
182
+
183
+ describe "handles vimeo.com videos" do
184
+
185
+ before(:each) do
186
+ FakeWeb.register_uri(:get, 'http://vimeo.com/10365005',
187
+ :response => File.read("spec/fixtures/vimeo.com.html"))
188
+ @uri = URI.parse("http://vimeo.com/10365005")
189
+
190
+ @content = Readability::Document.new(Nokogiri::HTML(open('http://vimeo.com/10365005')), @uri, @uri).content
191
+ end
192
+
193
+ it "should extract the video from the page" do
194
+ @content.should include("<iframe src=\"http://player.vimeo.com/video/10365005")
195
+ end
196
+
197
+ end
198
+
199
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1,4 @@
1
+ --colour
2
+ --format s -c
3
+ --loadby mtime
4
+ --reverse
@@ -0,0 +1,13 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'rubygems'
4
+ require 'readability'
5
+ require 'spec'
6
+ require 'spec/autorun'
7
+ require 'nokogiri'
8
+ require 'open-uri'
9
+ require 'fakeweb'
10
+
11
+ Spec::Runner.configure do |config|
12
+
13
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: busk-ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 25
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
8
  - 0
9
- - 6
10
- version: 1.0.6
9
+ - 7
10
+ version: 1.0.7
11
11
  platform: ruby
12
12
  authors: []
13
13
 
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-10-05 00:00:00 -03:00
18
+ date: 2011-01-05 00:00:00 -02:00
19
19
  default_executable:
20
20
  dependencies: []
21
21
 
@@ -29,6 +29,17 @@ extra_rdoc_files: []
29
29
 
30
30
  files:
31
31
  - lib/readability.rb
32
+ - spec/fixtures/cant_read.html
33
+ - spec/fixtures/sample.html
34
+ - spec/fixtures/samples/channel4-1-fragments.rb
35
+ - spec/fixtures/samples/channel4-1.html
36
+ - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
37
+ - spec/fixtures/samples/globemail-ottawa-cuts.html
38
+ - spec/fixtures/should_not_truncate.txt
39
+ - spec/fixtures/vimeo.com.html
40
+ - spec/readability_spec.rb
41
+ - spec/spec.opts
42
+ - spec/spec_helper.rb
32
43
  has_rdoc: true
33
44
  homepage: http://github.com/busk/ruby-readability
34
45
  licenses: []
@@ -38,6 +49,8 @@ rdoc_options: []
38
49
 
39
50
  require_paths:
40
51
  - lib
52
+ - spec
53
+ - spec/fixtures
41
54
  required_ruby_version: !ruby/object:Gem::Requirement
42
55
  none: false
43
56
  requirements: