fblee-readability 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,180 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), "spec_helper"))
2
+
3
+ describe Readability do
4
# Builds the small HTML page shared by the nested example groups below.
# The page has a <body> and a <p> carrying class 'comment', a <div> with
# class 'comment' but id 'body', and a <div> wrapping a <blockquote>.
# Fixture lines are kept flush-left so the string content is not altered.
before(:each) do
  @simple_html_fixture = <<-FIXTURE
<html>
<head>
<title>title!</title>
</head>
<body class='comment'>
<div>
<p class='comment'>a comment</p>
<div class='comment' id='body'>real content</div>
<div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
</div>
</body>
</html>
  FIXTURE
end
20
+
21
# Checks transform_misused_divs_into_paragraphs! against the shared fixture:
# the <div id='body'> holds only text, while <div id="contains_blockquote">
# wraps a <blockquote>.
describe "transformMisusedDivsIntoParagraphs" do
  before(:each) do
    @doc = Readability::Document.new(@simple_html_fixture)
    @doc.transform_misused_divs_into_paragraphs!
  end

  it "should transform divs containing no block elements into <p>s" do
    text_only_node = @doc.html.css("#body").first
    text_only_node.name.should == "p"
  end

  it "should not transform divs that contain block elements" do
    wrapper_node = @doc.html.css("#contains_blockquote").first
    wrapper_node.name.should == "div"
  end
end
35
+
36
# Compares the :content_score that score_node reports for two sibling
# elements, first by tag name and then by class name.
describe "score_node" do
  before(:each) do
    # Fixture lines are kept flush-left so the string content is not altered.
    markup = <<-HTML
<html>
<body>
<div id='elem1'>
<p>some content</p>
</div>
<th id='elem2'>
<p>some other content</p>
</th>
</body>
</html>
    HTML
    @doc = Readability::Document.new(markup)
    @elem1 = @doc.html.css("#elem1").first
    @elem2 = @doc.html.css("#elem2").first
  end

  it "should like <div>s more than <th>s" do
    div_score = @doc.score_node(@elem1)[:content_score]
    th_score = @doc.score_node(@elem2)[:content_score]
    div_score.should > th_score
  end

  it "should like classes like text more than classes like comment" do
    # Rename the <th> to a <div> so both elements start from the same tag.
    @elem2.name = "div"
    first_baseline = @doc.score_node(@elem1)[:content_score]
    second_baseline = @doc.score_node(@elem2)[:content_score]
    first_baseline.should == second_baseline

    # With equal tags, only the class names differ from here on.
    @elem1['class'] = "text"
    @elem2['class'] = "comment"
    text_score = @doc.score_node(@elem1)[:content_score]
    comment_score = @doc.score_node(@elem2)[:content_score]
    text_score.should > comment_score
  end
end
66
+
67
# Runs remove_unlikely_candidates! on the shared fixture and inspects the
# serialized markup that is left afterwards.
describe "remove_unlikely_candidates!" do
  before(:each) do
    @doc = Readability::Document.new(@simple_html_fixture)
    @doc.remove_unlikely_candidates!
  end

  it "should remove things that have class comment" do
    remaining_markup = @doc.html.inner_html
    remaining_markup.should_not =~ /a comment/
  end

  it "should not remove body tags" do
    # The fixture's <body> itself carries class 'comment'.
    remaining_markup = @doc.html.inner_html
    remaining_markup.should =~ /<\/body>/
  end

  it "should not remove things with class comment and id body" do
    remaining_markup = @doc.html.inner_html
    remaining_markup.should =~ /real content/
  end
end
85
+
86
# Exercises score_paragraphs(0) on a page with three paragraphs spread over
# nested and sibling <div>s, then inspects the returned candidates.
describe "score_paragraphs" do
  before(:each) do
    # Fixture lines are kept flush-left so the string content is not altered.
    # NOTE(review): the id attribute of the second <div> has no closing quote.
    # It is reproduced as-is because the candidate count asserted below may
    # depend on how the parser recovers from it; confirm whether it is
    # intentional.
    markup = <<-HTML
<html>
<head>
<title>title!</title>
</head>
<body id="body">
<div id="div1">
<div id="div2>
<p id="some_comment">a comment</p>
</div>
<p id="some_text">some text</p>
</div>
<div id="div3">
<p id="some_text2">some more text</p>
</div>
</body>
</html>
    HTML
    @doc = Readability::Document.new(markup)
    @candidates = @doc.score_paragraphs(0)
  end

  it "should score elements in the document" do
    scored = @candidates.values
    scored.length.should == 3
  end

  it "should prefer the body in this particular example" do
    # Order candidates from highest to lowest :content_score and take the top.
    ranked = @candidates.values.sort do |left, right|
      right[:content_score] <=> left[:content_score]
    end
    winner = ranked.first
    winner[:elem][:id].should == "body"
  end
end
119
+
120
# Runs extraction on fixtures/cant_read.html with a whitelist of tags and
# attributes and checks that a known sentence is present in the content.
describe "the cant_read.html fixture" do
  it "should work on the cant_read.html fixture with some allowed tags" do
    fixture_path = "#{File.dirname(__FILE__)}/fixtures/cant_read.html"
    html = File.read(fixture_path)

    options = {
      :tags => %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a],
      :attributes => %w[href]
    }
    extracted = Readability::Document.new(html, options).content

    extracted.should match(/Can you talk a little about how you developed the looks for the/)
  end
end
128
+
129
# Smoke test: a page with a single paragraph should yield that paragraph.
describe "general functionality" do
  before(:each) do
    page = "<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>"
    # The two thresholds are lowered so this very short page is passed through.
    # NOTE(review): exact meaning of :min_text_length / :retry_length is
    # defined in lib/readability.rb, which is not visible here.
    settings = { :min_text_length => 0, :retry_length => 1 }
    @doc = Readability::Document.new(page, settings)
  end

  it "should return the main page content" do
    extracted = @doc.content
    extracted.should match("Some content")
  end
end
139
+
140
# A <div class='sidebar'> placed next to the real content must not show up
# in the extracted content.
describe "ignoring sidebars" do
  before(:each) do
    page = "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>"
    settings = { :min_text_length => 0, :retry_length => 1 }
    @doc = Readability::Document.new(page, settings)
  end

  it "should not return the sidebar" do
    extracted = @doc.content
    extracted.should_not match("sidebar")
  end
end
150
+
151
# Regression check over every sample page in fixtures/samples. Each
# NAME.html is paired with NAME-fragments.rb, which is expected to set the
# globals $required_fragments and $excluded_fragments (that is how this spec
# reads them after loading the file).
describe "outputs good stuff for known documents" do
  before do
    @samples_dir = File.join(File.dirname(__FILE__), "fixtures", "samples")
    # Dir.glob does not guarantee an order; sort so runs are reproducible.
    @html_files = Dir.glob(File.join(@samples_dir, "*.html")).sort
    @samples = @html_files.map { |filename| File.basename(filename, '.html') }
  end

  # Note: when no sample files are found the loop body never runs and this
  # example passes after performing 0 checks.
  it "should output expected fragments of text" do
    checks = 0

    @samples.each do |sample|
      html = File.read(File.join(@samples_dir, "#{sample}.html"))
      doc = Readability::Document.new(html).content

      # Reset both lists before loading, so a fragments file that defines
      # only one of them cannot inherit the other from the previous sample.
      $required_fragments = []
      $excluded_fragments = []
      # Load by absolute path rather than relying on $LOAD_PATH or the
      # current working directory.
      load File.join(@samples_dir, "#{sample}-fragments.rb")
      puts "testing #{sample}..."

      $required_fragments.each do |required_text|
        doc.should include(required_text)
        checks += 1
      end

      $excluded_fragments.each do |text_to_avoid|
        doc.should_not include(text_to_avoid)
        checks += 1
      end
    end

    puts "Performed #{checks} checks."
  end
end
180
+ end
@@ -0,0 +1,10 @@
1
# Spec bootstrap: puts ../lib and this directory at the front of the load
# path (lib first), then loads the library and the Spec runner.
here = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(here, '..', 'lib'), here)

require 'rubygems'
require 'readability'
require 'spec'
require 'spec/autorun'

# No runner options are set; the block is intentionally empty.
Spec::Runner.configure do |config|
end
data/test_on_url.rb ADDED
@@ -0,0 +1,6 @@
1
# Fetches the page at the URL given as the first command-line argument and
# prints the content extracted by Readability::Document.
#
# Usage: ruby test_on_url.rb URL
require 'rubygems'
require 'open-uri'
require 'readability'

url = ARGV.first
abort "Usage: #{$PROGRAM_NAME} URL" if url.nil? || url.empty?

# Read through the parsed URI instead of Kernel#open: Kernel#open treats an
# argument starting with "|" as a command to run, and on Ruby 3.0+ it no
# longer opens URLs at all.
uri = URI.parse(url)
# Only URI classes that open-uri extends (http, https, ftp) respond to #read.
abort "Unsupported URL: #{url}" unless uri.respond_to?(:read)

text = uri.read
p Readability::Document.new(text).content
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fblee-readability
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Lee Mallabone
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-03-23 00:00:00 +00:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: Extracts readable content from an HTML page
22
+ email: lee@broadersheet.com
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - Rakefile
31
+ - readability.gemspec
32
+ - README
33
+ - test_on_url.rb
34
+ - lib/readability.rb
35
+ - spec/fixtures/cant_read.html
36
+ - spec/fixtures/sample.html
37
+ - spec/fixtures/should_not_truncate.txt
38
+ - spec/readability_spec.rb
39
+ - spec/spec_helper.rb
40
+ has_rdoc: true
41
+ homepage: http://github.com/fblee/ruby-readability
42
+ licenses: []
43
+
44
+ post_install_message:
45
+ rdoc_options: []
46
+
47
+ require_paths:
48
+ - lib
49
+ required_ruby_version: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ segments:
54
+ - 0
55
+ version: "0"
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ segments:
61
+ - 0
62
+ version: "0"
63
+ requirements: []
64
+
65
+ rubyforge_project:
66
+ rubygems_version: 1.3.6
67
+ signing_key:
68
+ specification_version: 3
69
+ summary: Extracts readable content from an HTML page
70
+ test_files: []
71
+