ruby-readability 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,180 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), "spec_helper"))
2
+
3
+ describe Readability do
4
+ before do # Builds the HTML fixture string that nested describe blocks pass to Readability::Document.new.
5
+ @simple_html_fixture = <<-HTML # Markup mixes class='comment' elements, one of them with id='body', and a div wrapping a blockquote.
6
+ <html>
7
+ <head>
8
+ <title>title!</title>
9
+ </head>
10
+ <body class='comment'>
11
+ <div>
12
+ <p class='comment'>a comment</p>
13
+ <div class='comment' id='body'>real content</div>
14
+ <div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
15
+ </div>
16
+ </body>
17
+ </html>
18
+ HTML
19
+ end
20
+
21
+ describe "transformMisusedDivsIntoParagraphs" do
22
+ before do
23
+ @doc = Readability::Document.new(@simple_html_fixture)
24
+ @doc.transform_misused_divs_into_paragraphs!
25
+ end
26
+
27
+ it "should transform divs containing no block elements into <p>s" do
28
+ @doc.html.css("#body").first.name.should == "p"
29
+ end
30
+
31
+ it "should not transform divs that contain block elements" do
32
+ @doc.html.css("#contains_blockquote").first.name.should == "div"
33
+ end
34
+ end
35
+
36
+ describe "score_node" do # Compares the :content_score that score_node reports for two elements.
37
+ before do
38
+ @doc = Readability::Document.new(<<-HTML) # Fixture: a <div id='elem1'> and a <th id='elem2'>, each holding one <p>.
39
+ <html>
40
+ <body>
41
+ <div id='elem1'>
42
+ <p>some content</p>
43
+ </div>
44
+ <th id='elem2'>
45
+ <p>some other content</p>
46
+ </th>
47
+ </body>
48
+ </html>
49
+ HTML
50
+ @elem1 = @doc.html.css("#elem1").first # First node matching each id selector.
51
+ @elem2 = @doc.html.css("#elem2").first
52
+ end
53
+
54
+ it "should like <div>s more than <th>s" do
55
+ @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
56
+ end
57
+
58
+ it "should like classes like text more than classes like comment" do
59
+ @elem2.name = "div" # Give both nodes the same tag name so only the class attribute differs later.
60
+ @doc.score_node(@elem1)[:content_score].should == @doc.score_node(@elem2)[:content_score] # Baseline: equal scores before classes are set.
61
+ @elem1['class'] = "text"
62
+ @elem2['class'] = "comment"
63
+ @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
64
+ end
65
+ end
66
+
67
+ describe "remove_unlikely_candidates!" do # Inspects the document's inner_html after remove_unlikely_candidates! has run on the shared fixture.
68
+ before do
69
+ @doc = Readability::Document.new(@simple_html_fixture)
70
+ @doc.remove_unlikely_candidates!
71
+ end
72
+
73
+ it "should remove things that have class comment" do # "a comment" is the text of the fixture's <p class='comment'>.
74
+ @doc.html.inner_html.should_not =~ /a comment/
75
+ end
76
+
77
+ it "should not remove body tags" do # The fixture's <body> also carries class='comment'.
78
+ @doc.html.inner_html.should =~ /<\/body>/
79
+ end
80
+
81
+ it "should not remove things with class comment and id body" do # "real content" is the text of the div with class='comment' id='body'.
82
+ @doc.html.inner_html.should =~ /real content/
83
+ end
84
+ end
85
+
86
+ describe "score_paragraphs" do # Checks the candidate collection returned by score_paragraphs(0) for a small nested-div document.
87
+ before(:each) do
88
+ @doc = Readability::Document.new(<<-HTML) # NOTE(review): the div2 line in this fixture has an unterminated attribute quote; the expected count of 3 below may depend on how the parser recovers - verify before "fixing" it.
89
+ <html>
90
+ <head>
91
+ <title>title!</title>
92
+ </head>
93
+ <body id="body">
94
+ <div id="div1">
95
+ <div id="div2>
96
+ <p id="some_comment">a comment</p>
97
+ </div>
98
+ <p id="some_text">some text</p>
99
+ </div>
100
+ <div id="div3">
101
+ <p id="some_text2">some more text</p>
102
+ </div>
103
+ </body>
104
+ </html>
105
+ HTML
106
+ @candidates = @doc.score_paragraphs(0) # Used below via .values; each value is indexed with :content_score and :elem.
107
+ end
108
+
109
+ it "should score elements in the document" do
110
+ @candidates.values.length.should == 3
111
+ end
112
+
113
+ it "should prefer the body in this particular example" do
114
+ @candidates.values.sort { |a, b| # Descending order by :content_score, so .first is the top-scoring candidate.
115
+ b[:content_score] <=> a[:content_score]
116
+ }.first[:elem][:id].should == "body"
117
+ end
118
+ end
119
+
120
+ describe "the cant_read.html fixture" do # Runs Document#content on fixtures/cant_read.html and expects a known sentence fragment.
121
+ it "should work on the cant_read.html fixture with some allowed tags" do
122
+ allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
123
+ allowed_attributes = %w[href]
124
+ html = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html") # Path is relative to this spec file's directory.
125
+ Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content.should match(/Can you talk a little about how you developed the looks for the/) # :tags/:attributes presumably whitelist output markup - confirm in the library.
126
+ end
127
+ end
128
+
129
+ describe "general functionality" do # Smoke test: content of a minimal one-paragraph document.
130
+ before do # The meaning of :min_text_length and :retry_length is defined by the library, not visible here.
131
+ @doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>",
132
+ :min_text_length => 0, :retry_length => 1)
133
+ end
134
+
135
+ it "should return the main page content" do
136
+ @doc.content.should match("Some content")
137
+ end
138
+ end
139
+
140
+ describe "ignoring sidebars" do # Same minimal document plus a second div with class='sidebar'.
141
+ before do # NOTE(review): the sidebar markup ends with a second <p> rather than </p>; confirm whether that is intentional.
142
+ @doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>",
143
+ :min_text_length => 0, :retry_length => 1)
144
+ end
145
+
146
+ it "should not return the sidebar" do
147
+ @doc.content.should_not match("sidebar")
148
+ end
149
+ end
150
+
151
+ describe "outputs good stuff for known documents" do
152
+ before do
153
+ @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
154
+ @samples = @html_files.map {|filename| File.basename(filename, '.html') }
155
+ end
156
+
157
+ it "should output expected fragments of text" do
158
+
159
+ checks = 0
160
+ @samples.each do |sample|
161
+ html = File.read(File.dirname(__FILE__) + "/fixtures/samples/#{sample}.html")
162
+ doc = Readability::Document.new(html).content
163
+
164
+ load "fixtures/samples/#{sample}-fragments.rb"
165
+ puts "testing #{sample}..."
166
+
167
+ $required_fragments.each do |required_text|
168
+ doc.should include(required_text)
169
+ checks += 1
170
+ end
171
+
172
+ $excluded_fragments.each do |text_to_avoid|
173
+ doc.should_not include(text_to_avoid)
174
+ checks += 1
175
+ end
176
+ end
177
+ puts "Performed #{checks} checks."
178
+ end
179
+ end
180
+ end
@@ -0,0 +1,10 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__)) # Put this helper's own directory at the front of the load path.
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) # Then put the sibling ../lib directory ahead of it so 'readability' resolves there.
3
+ require 'rubygems'
4
+ require 'readability'
5
+ require 'spec'
6
+ require 'spec/autorun'
7
+
8
+ Spec::Runner.configure do |config| # Intentionally empty: no custom runner configuration is applied.
9
+
10
+ end
metadata ADDED
@@ -0,0 +1,94 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ruby-readability
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Kyle Maxwell
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-08-30 00:00:00 -07:00
18
+ default_executable: readability
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rspec
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 1
29
+ - 2
30
+ - 9
31
+ version: 1.2.9
32
+ type: :development
33
+ version_requirements: *id001
34
+ description: ruby-readability
35
+ email: kmaxwell@twitter.com
36
+ executables:
37
+ - readability
38
+ extensions: []
39
+
40
+ extra_rdoc_files:
41
+ - README
42
+ files:
43
+ - .document
44
+ - .gitignore
45
+ - README
46
+ - Rakefile
47
+ - VERSION
48
+ - bin/readability
49
+ - lib/readability.rb
50
+ - lib/readability_old.rb
51
+ - spec/fixtures/cant_read.html
52
+ - spec/fixtures/sample.html
53
+ - spec/fixtures/samples/channel4-1-fragments.rb
54
+ - spec/fixtures/samples/channel4-1.html
55
+ - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
56
+ - spec/fixtures/samples/globemail-ottawa-cuts.html
57
+ - spec/fixtures/should_not_truncate.txt
58
+ - spec/readability_spec.rb
59
+ - spec/spec_helper.rb
60
+ has_rdoc: true
61
+ homepage: http://github.com/fizx/ruby-readability
62
+ licenses: []
63
+
64
+ post_install_message:
65
+ rdoc_options:
66
+ - --charset=UTF-8
67
+ require_paths:
68
+ - lib
69
+ required_ruby_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ segments:
74
+ - 0
75
+ version: "0"
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ segments:
81
+ - 0
82
+ version: "0"
83
+ requirements: []
84
+
85
+ rubyforge_project:
86
+ rubygems_version: 1.3.6
87
+ signing_key:
88
+ specification_version: 3
89
+ summary: ruby-readability
90
+ test_files:
91
+ - spec/fixtures/samples/channel4-1-fragments.rb
92
+ - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
93
+ - spec/readability_spec.rb
94
+ - spec/spec_helper.rb