ruby-readability 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/README +9 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/bin/readability +13 -0
- data/lib/readability.rb +295 -0
- data/lib/readability_old.rb +74 -0
- data/spec/fixtures/cant_read.html +426 -0
- data/spec/fixtures/sample.html +1198 -0
- data/spec/fixtures/samples/channel4-1-fragments.rb +14 -0
- data/spec/fixtures/samples/channel4-1.html +1330 -0
- data/spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb +31 -0
- data/spec/fixtures/samples/globemail-ottawa-cuts.html +2410 -0
- data/spec/fixtures/should_not_truncate.txt +1077 -0
- data/spec/readability_spec.rb +180 -0
- data/spec/spec_helper.rb +10 -0
- metadata +94 -0
@@ -0,0 +1,180 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), "spec_helper"))
|
2
|
+
|
3
|
+
describe Readability do
|
4
|
+
before do
|
5
|
+
@simple_html_fixture = <<-HTML
|
6
|
+
<html>
|
7
|
+
<head>
|
8
|
+
<title>title!</title>
|
9
|
+
</head>
|
10
|
+
<body class='comment'>
|
11
|
+
<div>
|
12
|
+
<p class='comment'>a comment</p>
|
13
|
+
<div class='comment' id='body'>real content</div>
|
14
|
+
<div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
|
15
|
+
</div>
|
16
|
+
</body>
|
17
|
+
</html>
|
18
|
+
HTML
|
19
|
+
end
|
20
|
+
|
21
|
+
describe "transformMisusedDivsIntoParagraphs" do
|
22
|
+
before do
|
23
|
+
@doc = Readability::Document.new(@simple_html_fixture)
|
24
|
+
@doc.transform_misused_divs_into_paragraphs!
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should transform divs containing no block elements into <p>s" do
|
28
|
+
@doc.html.css("#body").first.name.should == "p"
|
29
|
+
end
|
30
|
+
|
31
|
+
it "should not transform divs that contain block elements" do
|
32
|
+
@doc.html.css("#contains_blockquote").first.name.should == "div"
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
describe "score_node" do
|
37
|
+
before do
|
38
|
+
@doc = Readability::Document.new(<<-HTML)
|
39
|
+
<html>
|
40
|
+
<body>
|
41
|
+
<div id='elem1'>
|
42
|
+
<p>some content</p>
|
43
|
+
</div>
|
44
|
+
<th id='elem2'>
|
45
|
+
<p>some other content</p>
|
46
|
+
</th>
|
47
|
+
</body>
|
48
|
+
</html>
|
49
|
+
HTML
|
50
|
+
@elem1 = @doc.html.css("#elem1").first
|
51
|
+
@elem2 = @doc.html.css("#elem2").first
|
52
|
+
end
|
53
|
+
|
54
|
+
it "should like <div>s more than <th>s" do
|
55
|
+
@doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
|
56
|
+
end
|
57
|
+
|
58
|
+
it "should like classes like text more than classes like comment" do
|
59
|
+
@elem2.name = "div"
|
60
|
+
@doc.score_node(@elem1)[:content_score].should == @doc.score_node(@elem2)[:content_score]
|
61
|
+
@elem1['class'] = "text"
|
62
|
+
@elem2['class'] = "comment"
|
63
|
+
@doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
describe "remove_unlikely_candidates!" do
|
68
|
+
before do
|
69
|
+
@doc = Readability::Document.new(@simple_html_fixture)
|
70
|
+
@doc.remove_unlikely_candidates!
|
71
|
+
end
|
72
|
+
|
73
|
+
it "should remove things that have class comment" do
|
74
|
+
@doc.html.inner_html.should_not =~ /a comment/
|
75
|
+
end
|
76
|
+
|
77
|
+
it "should not remove body tags" do
|
78
|
+
@doc.html.inner_html.should =~ /<\/body>/
|
79
|
+
end
|
80
|
+
|
81
|
+
it "should not remove things with class comment and id body" do
|
82
|
+
@doc.html.inner_html.should =~ /real content/
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
describe "score_paragraphs" do
|
87
|
+
before(:each) do
|
88
|
+
@doc = Readability::Document.new(<<-HTML)
|
89
|
+
<html>
|
90
|
+
<head>
|
91
|
+
<title>title!</title>
|
92
|
+
</head>
|
93
|
+
<body id="body">
|
94
|
+
<div id="div1">
|
95
|
+
<div id="div2>
|
96
|
+
<p id="some_comment">a comment</p>
|
97
|
+
</div>
|
98
|
+
<p id="some_text">some text</p>
|
99
|
+
</div>
|
100
|
+
<div id="div3">
|
101
|
+
<p id="some_text2">some more text</p>
|
102
|
+
</div>
|
103
|
+
</body>
|
104
|
+
</html>
|
105
|
+
HTML
|
106
|
+
@candidates = @doc.score_paragraphs(0)
|
107
|
+
end
|
108
|
+
|
109
|
+
it "should score elements in the document" do
|
110
|
+
@candidates.values.length.should == 3
|
111
|
+
end
|
112
|
+
|
113
|
+
it "should prefer the body in this particular example" do
|
114
|
+
@candidates.values.sort { |a, b|
|
115
|
+
b[:content_score] <=> a[:content_score]
|
116
|
+
}.first[:elem][:id].should == "body"
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
describe "the cant_read.html fixture" do
|
121
|
+
it "should work on the cant_read.html fixture with some allowed tags" do
|
122
|
+
allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
|
123
|
+
allowed_attributes = %w[href]
|
124
|
+
html = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html")
|
125
|
+
Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content.should match(/Can you talk a little about how you developed the looks for the/)
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
describe "general functionality" do
|
130
|
+
before do
|
131
|
+
@doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>",
|
132
|
+
:min_text_length => 0, :retry_length => 1)
|
133
|
+
end
|
134
|
+
|
135
|
+
it "should return the main page content" do
|
136
|
+
@doc.content.should match("Some content")
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
describe "ignoring sidebars" do
|
141
|
+
before do
|
142
|
+
@doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>",
|
143
|
+
:min_text_length => 0, :retry_length => 1)
|
144
|
+
end
|
145
|
+
|
146
|
+
it "should not return the sidebar" do
|
147
|
+
@doc.content.should_not match("sidebar")
|
148
|
+
end
|
149
|
+
end
|
150
|
+
|
151
|
+
describe "outputs good stuff for known documents" do
|
152
|
+
before do
|
153
|
+
@html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
|
154
|
+
@samples = @html_files.map {|filename| File.basename(filename, '.html') }
|
155
|
+
end
|
156
|
+
|
157
|
+
it "should output expected fragments of text" do
|
158
|
+
|
159
|
+
checks = 0
|
160
|
+
@samples.each do |sample|
|
161
|
+
html = File.read(File.dirname(__FILE__) + "/fixtures/samples/#{sample}.html")
|
162
|
+
doc = Readability::Document.new(html).content
|
163
|
+
|
164
|
+
# Resolve against this spec's own directory (as the File.read above does) so the load does not depend on the cwd or $LOAD_PATH.
load File.expand_path("fixtures/samples/#{sample}-fragments.rb", File.dirname(__FILE__))
|
165
|
+
puts "testing #{sample}..."
|
166
|
+
|
167
|
+
$required_fragments.each do |required_text|
|
168
|
+
doc.should include(required_text)
|
169
|
+
checks += 1
|
170
|
+
end
|
171
|
+
|
172
|
+
$excluded_fragments.each do |text_to_avoid|
|
173
|
+
doc.should_not include(text_to_avoid)
|
174
|
+
checks += 1
|
175
|
+
end
|
176
|
+
end
|
177
|
+
puts "Performed #{checks} checks."
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ruby-readability
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
version: 0.1.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Kyle Maxwell
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-08-30 00:00:00 -07:00
|
18
|
+
default_executable: readability
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: rspec
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 1
|
29
|
+
- 2
|
30
|
+
- 9
|
31
|
+
version: 1.2.9
|
32
|
+
type: :development
|
33
|
+
version_requirements: *id001
|
34
|
+
description: ruby-readability
|
35
|
+
email: kmaxwell@twitter.com
|
36
|
+
executables:
|
37
|
+
- readability
|
38
|
+
extensions: []
|
39
|
+
|
40
|
+
extra_rdoc_files:
|
41
|
+
- README
|
42
|
+
files:
|
43
|
+
- .document
|
44
|
+
- .gitignore
|
45
|
+
- README
|
46
|
+
- Rakefile
|
47
|
+
- VERSION
|
48
|
+
- bin/readability
|
49
|
+
- lib/readability.rb
|
50
|
+
- lib/readability_old.rb
|
51
|
+
- spec/fixtures/cant_read.html
|
52
|
+
- spec/fixtures/sample.html
|
53
|
+
- spec/fixtures/samples/channel4-1-fragments.rb
|
54
|
+
- spec/fixtures/samples/channel4-1.html
|
55
|
+
- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
|
56
|
+
- spec/fixtures/samples/globemail-ottawa-cuts.html
|
57
|
+
- spec/fixtures/should_not_truncate.txt
|
58
|
+
- spec/readability_spec.rb
|
59
|
+
- spec/spec_helper.rb
|
60
|
+
has_rdoc: true
|
61
|
+
homepage: http://github.com/fizx/ruby-readability
|
62
|
+
licenses: []
|
63
|
+
|
64
|
+
post_install_message:
|
65
|
+
rdoc_options:
|
66
|
+
- --charset=UTF-8
|
67
|
+
require_paths:
|
68
|
+
- lib
|
69
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
segments:
|
74
|
+
- 0
|
75
|
+
version: "0"
|
76
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
segments:
|
81
|
+
- 0
|
82
|
+
version: "0"
|
83
|
+
requirements: []
|
84
|
+
|
85
|
+
rubyforge_project:
|
86
|
+
rubygems_version: 1.3.6
|
87
|
+
signing_key:
|
88
|
+
specification_version: 3
|
89
|
+
summary: ruby-readability
|
90
|
+
test_files:
|
91
|
+
- spec/fixtures/samples/channel4-1-fragments.rb
|
92
|
+
- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
|
93
|
+
- spec/readability_spec.rb
|
94
|
+
- spec/spec_helper.rb
|