ruby-readability 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,180 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), "spec_helper"))
2
+
3
+ describe Readability do
4
# Builds the small HTML page shared by the example groups that construct a
# document from @simple_html_fixture. The markup deliberately mixes the
# class 'comment' with the id 'body' so that the examples can tell removal
# by class apart from protection by id.
before(:each) do
  @simple_html_fixture = <<-HTML
    <html>
      <head>
        <title>title!</title>
      </head>
      <body class='comment'>
        <div>
          <p class='comment'>a comment</p>
          <div class='comment' id='body'>real content</div>
          <div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
        </div>
      </body>
    </html>
  HTML
end
20
+
21
# Examples for Document#transform_misused_divs_into_paragraphs!, run against
# the shared @simple_html_fixture.
describe "transformMisusedDivsIntoParagraphs" do
  before do
    @doc = Readability::Document.new(@simple_html_fixture)
    @doc.transform_misused_divs_into_paragraphs!
  end

  it "should transform divs containing no block elements into <p>s" do
    # '#body' started out as a <div> holding only text.
    text_only_node = @doc.html.css("#body").first
    text_only_node.name.should == "p"
  end

  it "should not transform divs that contain block elements" do
    # '#contains_blockquote' wraps a <blockquote>, so it must stay a <div>.
    wrapper_node = @doc.html.css("#contains_blockquote").first
    wrapper_node.name.should == "div"
  end
end
35
+
36
# Examples for Document#score_node, comparing the :content_score entry of
# the hashes it returns for two elements.
describe "score_node" do
  before do
    markup = <<-HTML
      <html>
        <body>
          <div id='elem1'>
            <p>some content</p>
          </div>
          <th id='elem2'>
            <p>some other content</p>
          </th>
        </body>
      </html>
    HTML
    @doc = Readability::Document.new(markup)
    @elem1 = @doc.html.css("#elem1").first
    @elem2 = @doc.html.css("#elem2").first
  end

  it "should like <div>s more than <th>s" do
    div_score = @doc.score_node(@elem1)[:content_score]
    th_score = @doc.score_node(@elem2)[:content_score]
    div_score.should > th_score
  end

  it "should like classes like text more than classes like comment" do
    # Turn the <th> into a <div> so both elements start from the same score.
    @elem2.name = "div"
    baseline1 = @doc.score_node(@elem1)[:content_score]
    baseline2 = @doc.score_node(@elem2)[:content_score]
    baseline1.should == baseline2

    # With the tag names equal, only the class names can separate the scores.
    @elem1['class'] = "text"
    @elem2['class'] = "comment"
    text_score = @doc.score_node(@elem1)[:content_score]
    comment_score = @doc.score_node(@elem2)[:content_score]
    text_score.should > comment_score
  end
end
66
+
67
# Examples for Document#remove_unlikely_candidates!, run against the shared
# @simple_html_fixture and checked through the serialized inner HTML.
describe "remove_unlikely_candidates!" do
  before do
    @doc = Readability::Document.new(@simple_html_fixture)
    @doc.remove_unlikely_candidates!
  end

  it "should remove things that have class comment" do
    remaining_html = @doc.html.inner_html
    remaining_html.should_not =~ /a comment/
  end

  it "should not remove body tags" do
    # The fixture's <body> also carries class 'comment'.
    remaining_html = @doc.html.inner_html
    remaining_html.should =~ %r{</body>}
  end

  it "should not remove things with class comment and id body" do
    remaining_html = @doc.html.inner_html
    remaining_html.should =~ /real content/
  end
end
85
+
86
# Examples for Document#score_paragraphs. The examples treat its return
# value as a hash whose values carry :content_score and :elem entries.
describe "score_paragraphs" do
  before do
    # NOTE(review): the id attribute of the second <div> below is missing its
    # closing quote. The expected candidate count of 3 may depend on how the
    # HTML parser recovers from that -- confirm before "fixing" the markup.
    markup = <<-HTML
      <html>
        <head>
          <title>title!</title>
        </head>
        <body id="body">
          <div id="div1">
            <div id="div2>
              <p id="some_comment">a comment</p>
            </div>
            <p id="some_text">some text</p>
          </div>
          <div id="div3">
            <p id="some_text2">some more text</p>
          </div>
        </body>
      </html>
    HTML
    @doc = Readability::Document.new(markup)
    # The argument is presumably a minimum text length -- TODO confirm
    # against Document#score_paragraphs.
    @candidates = @doc.score_paragraphs(0)
  end

  it "should score elements in the document" do
    scored = @candidates.values
    scored.length.should == 3
  end

  it "should prefer the body in this particular example" do
    # Rank candidates from highest to lowest content score.
    ranked = @candidates.values.sort do |left, right|
      right[:content_score] <=> left[:content_score]
    end
    winner = ranked.first
    winner[:elem][:id].should == "body"
  end
end
119
+
120
# Runs the extractor over fixtures/cant_read.html with a whitelist of tags
# and attributes and checks that a known sentence is present in the content.
describe "the cant_read.html fixture" do
  it "should work on the cant_read.html fixture with some allowed tags" do
    options = {
      :tags       => %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a],
      :attributes => %w[href]
    }
    fixture_path = File.dirname(__FILE__) + "/fixtures/cant_read.html"
    html = File.read(fixture_path)

    content = Readability::Document.new(html, options).content
    content.should match(/Can you talk a little about how you developed the looks for the/)
  end
end
128
+
129
# Smoke test: a minimal page with a single paragraph should have that
# paragraph returned by Document#content.
describe "general functionality" do
  before do
    html = "<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>"
    options = { :min_text_length => 0, :retry_length => 1 }
    @doc = Readability::Document.new(html, options)
  end

  it "should return the main page content" do
    extracted = @doc.content
    extracted.should match("Some content")
  end
end
139
+
140
# A page with a content <div> and a second <div class='sidebar'>: the text
# of the sidebar must not appear in Document#content.
describe "ignoring sidebars" do
  before do
    html = "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>"
    options = { :min_text_length => 0, :retry_length => 1 }
    @doc = Readability::Document.new(html, options)
  end

  it "should not return the sidebar" do
    extracted = @doc.content
    extracted.should_not match("sidebar")
  end
end
150
+
151
# Regression examples driven by the files in fixtures/samples. Every
# "<name>.html" there is paired with a "<name>-fragments.rb" script, which is
# loaded after extraction and is read through the globals $required_fragments
# (texts the content must include) and $excluded_fragments (texts it must not).
describe "outputs good stuff for known documents" do
  before do
    # Anchor every path to this spec file so the examples do not depend on
    # the working directory or on $LOAD_PATH.
    @samples_dir = File.expand_path(File.join(File.dirname(__FILE__), "fixtures", "samples"))
    # Dir.glob makes no ordering promise; sort for a reproducible run order.
    @html_files = Dir.glob(File.join(@samples_dir, "*.html")).sort
    @samples = @html_files.map { |filename| File.basename(filename, '.html') }
  end

  it "should output expected fragments of text" do
    # Without this, a missing or misplaced samples directory would make the
    # example pass while checking nothing.
    @samples.should_not be_empty

    checks = 0
    @samples.each do |sample|
      html = File.read(File.join(@samples_dir, "#{sample}.html"))
      content = Readability::Document.new(html).content

      # Clear the globals first so that a fragments file which sets only one
      # of them cannot inherit values left behind by the previous sample.
      $required_fragments = []
      $excluded_fragments = []
      load File.join(@samples_dir, "#{sample}-fragments.rb")
      puts "testing #{sample}..."

      $required_fragments.each do |required_text|
        content.should include(required_text)
        checks += 1
      end

      $excluded_fragments.each do |text_to_avoid|
        content.should_not include(text_to_avoid)
        checks += 1
      end
    end
    puts "Performed #{checks} checks."
  end
end
180
+ end
@@ -0,0 +1,10 @@
1
# Spec bootstrap: puts the spec directory and the project's lib directory at
# the front of the load path, then loads the library under test and RSpec 1.x.
spec_dir = File.dirname(__FILE__)
lib_dir = File.join(spec_dir, '..', 'lib')

# lib_dir is unshifted last, so it ends up ahead of spec_dir.
$LOAD_PATH.unshift(spec_dir)
$LOAD_PATH.unshift(lib_dir)

require 'rubygems'
require 'readability'
require 'spec'
require 'spec/autorun'

# No runner-wide configuration is needed yet; the block is kept as a hook.
Spec::Runner.configure do |config|
end
metadata ADDED
@@ -0,0 +1,94 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ruby-readability
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Kyle Maxwell
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-08-30 00:00:00 -07:00
18
+ default_executable: readability
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rspec
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 1
29
+ - 2
30
+ - 9
31
+ version: 1.2.9
32
+ type: :development
33
+ version_requirements: *id001
34
+ description: ruby-readability
35
+ email: kmaxwell@twitter.com
36
+ executables:
37
+ - readability
38
+ extensions: []
39
+
40
+ extra_rdoc_files:
41
+ - README
42
+ files:
43
+ - .document
44
+ - .gitignore
45
+ - README
46
+ - Rakefile
47
+ - VERSION
48
+ - bin/readability
49
+ - lib/readability.rb
50
+ - lib/readability_old.rb
51
+ - spec/fixtures/cant_read.html
52
+ - spec/fixtures/sample.html
53
+ - spec/fixtures/samples/channel4-1-fragments.rb
54
+ - spec/fixtures/samples/channel4-1.html
55
+ - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
56
+ - spec/fixtures/samples/globemail-ottawa-cuts.html
57
+ - spec/fixtures/should_not_truncate.txt
58
+ - spec/readability_spec.rb
59
+ - spec/spec_helper.rb
60
+ has_rdoc: true
61
+ homepage: http://github.com/fizx/ruby-readability
62
+ licenses: []
63
+
64
+ post_install_message:
65
+ rdoc_options:
66
+ - --charset=UTF-8
67
+ require_paths:
68
+ - lib
69
+ required_ruby_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ segments:
74
+ - 0
75
+ version: "0"
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ segments:
81
+ - 0
82
+ version: "0"
83
+ requirements: []
84
+
85
+ rubyforge_project:
86
+ rubygems_version: 1.3.6
87
+ signing_key:
88
+ specification_version: 3
89
+ summary: ruby-readability
90
+ test_files:
91
+ - spec/fixtures/samples/channel4-1-fragments.rb
92
+ - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
93
+ - spec/readability_spec.rb
94
+ - spec/spec_helper.rb