distillery 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,259 @@
1
+ require 'spec_helper'
2
+
3
+ module Distillery
4
+ describe Document do
5
+
6
# Raw HTML of the fixture page that the default subject is built from.
# File.read closes the file itself; File.open(...).read left the handle open
# until it was garbage collected.
let(:document) { File.read('./spec/fixtures/pina_collada_cupcakes.html') }

# Parsed eagerly (let!) so the parse has already happened by the time the
# delegation examples stub ::Nokogiri.HTML to hand this object back.
let!(:noko_doc) { ::Nokogiri::HTML(document) }

# The Document under test, built from the fixture markup.
subject { Document.new(document) }
9
+
10
# Builds a Document from +html+ wrapped in the html/body shell produced by
# html_of, then calls each method named in +postprocessing+ on it, in the
# order given. Returns the Document.
def document_of(html, *postprocessing)
  doc = Document.new(html_of(html))
  postprocessing.each { |step| doc.send(step) }
  doc
end
17
+
18
# Wraps +body+ in a bare html/body skeleton and returns the resulting
# markup string.
def html_of(body)
  '<html><body>' + body.to_s + '</body></html>'
end
21
+
22
describe ".new" do
  # Constructing a Document with no markup at all must fail loudly.
  it 'raises an exception without an argument' do
    expect { Document.new }.to raise_error(ArgumentError)
  end
end
29
+
30
describe 'nokogiri delegation' do
  # Make Nokogiri hand back the pre-parsed noko_doc and give its #to_xml a
  # recognisable canned value to assert against.
  before do
    ::Nokogiri.stub(:HTML).and_return(noko_doc)
    noko_doc.stub(:to_xml).and_return('xml-doc')
  end

  it "delegates method_calls to the internal doc" do
    noko_doc.should_receive(:to_xml).once
    result = subject.to_xml
    result.should == 'xml-doc'
  end
end
43
+
44
describe 'remove_irrelevant_elements!' do
  %w[script link meta].each do |tag|
    # The label names the tag itself; the previous "##{tag}" rendered as
    # "#script", which reads like an id selector.
    it "strips out #{tag} tags" do
      # Checks the fixture really contains the tag first, so the example
      # cannot pass vacuously.
      subject.search(tag).should_not be_empty
      subject.remove_irrelevant_elements!
      subject.search(tag).should be_empty
    end
  end
end

describe 'remove_unlikely_elements!' do
  %w[combx comment disqus foot header menu meta nav rss shoutbox sidebar sponsor].each do |klass|
    it "removes any elements classed .#{klass}, as it is unlikely to be page content" do
      doc = document_of("<div class='#{klass}'>foo</div>", :remove_unlikely_elements!)
      doc.inner_html.should == html_of("")
    end

    it "removes any elements id'd ##{klass}, as it is unlikely to be page content" do
      doc = document_of("<div id='#{klass}'>foo</div>", :remove_unlikely_elements!)
      doc.inner_html.should == html_of("")
    end
  end

  # This example calls remove_unlikely_elements!, so it belongs in this
  # group; it was previously filed under remove_irrelevant_elements!.
  it 'does not remove the body even if it has a bad class or id' do
    doc = Document.new("<html><body class='sidebar'>foo</body></html>")
    doc.remove_unlikely_elements!
    doc.search('body').should_not be_empty
  end
end
76
+
77
describe 'coerce_elements_to_paragraphs!' do
  it 'converts divs that have no children to paragraphs' do
    doc = document_of("<div>foo</div>", :coerce_elements_to_paragraphs!)
    doc.inner_html.should == html_of("<p>foo</p>")
  end

  it 'converts divs that have children that are not block-level elements to paragraphs' do
    doc = document_of("<div><span>foo</span></div>", :coerce_elements_to_paragraphs!)
    doc.inner_html.should == html_of("<p><span>foo</span></p>")
  end

  it 'converts divs that have empty child divs to paragraphs' do
    doc = document_of("<div><pre>foo</pre><div></div></div>", :coerce_elements_to_paragraphs!)
    # Newlines are stripped before comparing, so only the element structure
    # is being asserted.
    doc.inner_html.gsub("\n", "").should == html_of("<p><pre>foo</pre><p></p></p>")
  end
end
95
+
96
# Examples for Document#score!. As the assertions below show, scores is a
# Hash keyed by the XPath of each scored element.
describe '#score!' do
  it 'populates the score ivar with data' do
    subject.scores.should be_a(Hash)
    subject.scores.should be_empty
    subject.score!
    subject.scores.should_not be_empty
  end

  it 'only calculates scores for paragraphs' do
    doc = document_of("<p>foo</p><div>bar</div>", :score!)
    doc.scores.should_not have_key('/html/body/div')
    doc.scores.should have_key('/html/body/p')
  end

  it 'gives one point per comma in the text of an element' do
    doc = document_of("<p>foo,bar,baz</p>", :score!)
    doc.scores['/html/body/p'].should == 4
  end

  it 'gives one point per chunk of 100 characters, max of 3' do
    doc = document_of("<p>#{'f'*201}</p>", :score!)
    doc.scores['/html/body/p'].should == 4

    # 1000 characters would be ten chunks; the expected score reflects the cap.
    doc = document_of("<p>#{'f'*1000}</p>", :score!)
    doc.scores['/html/body/p'].should == 5
  end

  it 'adds its own points to its parent' do
    # The expected keys show the parser places the div directly under body
    # rather than inside the outer <p>.
    doc = document_of("<p><div><p>foo</p></div></p>", :score!)
    doc.scores['/html/body/div/p'].should == 2
    doc.scores['/html/body/div'].should == 2
  end

  it 'adds 1/2 its points to its grandparent' do
    doc = document_of("<p><div><div><p>foo</p></div></div></p>", :score!)
    doc.scores['/html/body/div/div/p'].should == 2
    doc.scores['/html/body/div/div'].should == 2
    doc.scores['/html/body/div'].should == 1
  end

  it 'scales the final score by the inverse link density' do
    doc = document_of("<p>foobar<a>baz</a></p>", :score!)
    # The result is a Float, so compare with a tolerance instead of exact
    # equality, which depends on the order of floating-point operations.
    doc.scores['/html/body/p'].should be_within(1e-9).of(1.3333333333333335)
  end
end
143
+
144
describe 'clean_top_scoring_element!' do
  # Embeds +markup+ in a div.winner that also holds five comma-bearing
  # paragraphs, then runs prep_for_distillation!, score! and any extra
  # +postprocessing+ methods on the resulting Document, in that order.
  def doc_with_top_scored_html_of(markup, *postprocessing)
    winner = '<div class="winner">' + ('<p>foo,</p>'*5) + markup + '</div>'
    document_of(winner, :prep_for_distillation!, :score!, *postprocessing)
  end

  it 'removes all empty elements' do
    # The fixture previously read "</span</div>" (missing ">"), leaving what
    # was actually tested up to the parser's error recovery.
    doc = doc_with_top_scored_html_of("<div>foo <span></span></div>", :clean_top_scoring_element!)
    doc.search('span').should be_empty
  end

  it 'does not remove <br> elements' do
    doc = doc_with_top_scored_html_of("<div>foo<br class='noremove' /></div>", :clean_top_scoring_element!)
    doc.search('.noremove').should_not be_empty
  end

  %w[iframe form object].each do |tag|
    it "removes any #{tag} elements" do
      doc = doc_with_top_scored_html_of("foo <#{tag}></#{tag}>", :clean_top_scoring_element!)
      doc.search(tag).should be_empty
    end
  end

  it 'removes elements that have negative scores' do
    doc = doc_with_top_scored_html_of("<div class='widget'><div>bar</div></div>", :clean_top_scoring_element!)
    doc.search('.widget').should be_empty
  end

  it 'removes elements that have more images than p tags' do
    doc = doc_with_top_scored_html_of("<div class='remove'><img><img><img><p>bar</p><div>foo</div></div>", :clean_top_scoring_element!)
    doc.search('.remove').should be_empty
  end

  it 'removes elements that have way more li elements and it is not a list' do
    doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :clean_top_scoring_element!)
    doc.search('.remove').should be_empty
  end

  it 'removes elements that have more inputs than 1/3 the amount of p tags' do
    doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :clean_top_scoring_element!)
    doc.search('.remove').should be_empty

    # Counter-case: a single input against three paragraphs is kept.
    doc = doc_with_top_scored_html_of("<div class='remove'><input><p>#{'f'*25}</p><p>f</p><p>f</p></div>", :clean_top_scoring_element!)
    doc.search('.remove').should_not be_empty
  end

  it 'removes elements that have < 25 characters and (no images or > 2 images)' do
    doc = doc_with_top_scored_html_of("<div class='remove'><div>foo</div></div>", :clean_top_scoring_element!)
    doc.search('.remove').should be_empty

    doc = doc_with_top_scored_html_of("<div class='remove'><div>foo <img><img><img></div></div>", :clean_top_scoring_element!)
    doc.search('.remove').should be_empty
  end

  it 'removes elements that have a weight of < 25 and link density > 0.2' do
    doc = doc_with_top_scored_html_of("<div class='remove'><div>fffff<a>#{'b'*2}</a></div></div>", :clean_top_scoring_element!)
    doc.search('.remove').should be_empty
  end

  it 'removes elements that have a weight of >= 25 and link density > 0.5' do
    doc = doc_with_top_scored_html_of("<div class='remove article'><div>#{'f'*100}<a>#{'b'*150}</a></div></div>", :clean_top_scoring_element!)
    doc.search('.remove').should be_empty
  end

  it 'should not clean the content elements not of table ul or div' do
    doc = doc_with_top_scored_html_of("<span class='remove'><strong>Source:</strong> Wikipedia</span>", :clean_top_scoring_element!)
    doc.search('.remove').should_not be_empty
  end
end
214
+
215
describe '#distill!' do
  it 'returns the page content' do
    subject.distill!.should =~ /great for lazy bakers/
  end

  it 'returns markup without the header' do
    subject.distill!.should_not =~ /skinnytasteheader_1000_3/
  end

  it 'returns markup without the footer' do
    subject.distill!.should_not =~ /Design by Call Me Kristin/
  end

  it 'returns markup without navigation' do
    subject.distill!.should_not =~ /STNavbar1/
  end

  it 'returns markup without comments' do
    subject.distill!.should_not =~ /Cindy said.../
  end

  # Guard on the capability rather than on a version string: the old
  # RUBY_VERSION =~ /^1.9/ check silently skipped this example on every Ruby
  # released after 1.9, and its unescaped dot matched any character.
  if String.method_defined?(:encode!)
    it 'keeps the encoding of the string that was passed in to the constructor' do
      string = "<html><body><p>foo</p></body></html>"
      string.encode!('ISO-8859-1')
      Document.new(string).distill!.encoding.name.should == 'ISO-8859-1'
    end
  end

  it 'does not clean the page if :clean => false is passed' do
    # File.read closes the file itself; File.open(...).read left the handle
    # open until garbage collection.
    doc = Document.new(File.read('./spec/fixtures/baked_ziti.html'))
    doc.distill!(:clean => false).should =~ /Add to Recipe Box/

    # A fresh Document is built here rather than calling distill! a second
    # time on the first one.
    doc = Document.new(File.read('./spec/fixtures/baked_ziti.html'))
    doc.distill!.should_not =~ /Add to Recipe Box/
  end

  it 'works with a HTML document that has no winner' do
    document_of('foo').distill!.should == 'foo'
  end
end
257
+
258
+ end
259
+ end
@@ -0,0 +1,27 @@
1
+ require 'spec_helper'
2
+
3
describe Distillery do

  describe '.distill' do

    # File.read closes the file itself; File.open(...).read left the handle
    # open until garbage collection.
    let(:document) { File.read('./spec/fixtures/pina_collada_cupcakes.html') }

    # Stand-in for a Distillery::Document. The stubbed method is distill!
    # (with the bang), matching what every expectation below sets up; the
    # previous :distill stub named a method no example ever expects.
    let(:mockdoc) { mock(:doc, :distill! => 'test') }

    it 'takes a string and returns the distilled markup' do
      Distillery.distill(document).should be_a(String)
    end

    it 'defers to Distillery::Document' do
      Distillery::Document.should_receive(:new).once.with(document).and_return(mockdoc)
      mockdoc.should_receive(:distill!).once
      Distillery.distill(document)
    end

    it 'passes the same options through to the distill! method' do
      Distillery::Document.stub!(:new).and_return(mockdoc)
      mockdoc.should_receive(:distill!).once.with(hash_including(:clean => false))
      Distillery.distill(document, :clean => false)
    end
  end

end
@@ -0,0 +1,13 @@
1
+ require 'distillery'
2
+ require 'rspec'
3
+ require 'ruby-debug'
4
+
5
# Load every support file under spec/support before configuring RSpec.
Dir.glob('./spec/support/**/*.rb').each { |support_file| require support_file }

# Suite-wide RSpec settings.
RSpec.configure do |rspec|
  rspec.color_enabled = true
  rspec.debug = true

  # Run only examples tagged :focus => true; when no example carries the tag,
  # run the entire suite instead.
  rspec.filter_run :focus => true
  rspec.run_all_when_everything_filtered = true
end
metadata ADDED
@@ -0,0 +1,180 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: distillery
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jeff Pollard
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-04-30 00:00:00.000000000 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: nokogiri
17
+ requirement: &2161079840 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>'
21
+ - !ruby/object:Gem::Version
22
+ version: '1.0'
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: *2161079840
26
+ - !ruby/object:Gem::Dependency
27
+ name: slop
28
+ requirement: &2161079300 !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>'
32
+ - !ruby/object:Gem::Version
33
+ version: '1.0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: *2161079300
37
+ - !ruby/object:Gem::Dependency
38
+ name: rspec
39
+ requirement: &2161078840 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ! '>'
43
+ - !ruby/object:Gem::Version
44
+ version: '2.0'
45
+ type: :development
46
+ prerelease: false
47
+ version_requirements: *2161078840
48
+ - !ruby/object:Gem::Dependency
49
+ name: guard
50
+ requirement: &2161078340 !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ type: :development
57
+ prerelease: false
58
+ version_requirements: *2161078340
59
+ - !ruby/object:Gem::Dependency
60
+ name: guard-rspec
61
+ requirement: &2161077760 !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ! '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ type: :development
68
+ prerelease: false
69
+ version_requirements: *2161077760
70
+ - !ruby/object:Gem::Dependency
71
+ name: ruby-debug19
72
+ requirement: &2161077220 !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ type: :development
79
+ prerelease: false
80
+ version_requirements: *2161077220
81
+ - !ruby/object:Gem::Dependency
82
+ name: rb-fsevent
83
+ requirement: &2161076740 !ruby/object:Gem::Requirement
84
+ none: false
85
+ requirements:
86
+ - - ! '>='
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ type: :development
90
+ prerelease: false
91
+ version_requirements: *2161076740
92
+ - !ruby/object:Gem::Dependency
93
+ name: growl
94
+ requirement: &2161057060 !ruby/object:Gem::Requirement
95
+ none: false
96
+ requirements:
97
+ - - ! '>='
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ type: :development
101
+ prerelease: false
102
+ version_requirements: *2161057060
103
+ description: Distillery extracts the "content" portion out of an HTML document. It
104
+ applies heuristics based on element type, location, class/id name and other attributes
105
+ to try and find the content part of the HTML document and return it.
106
+ email:
107
+ - jeff.pollard@gmail.com
108
+ executables:
109
+ - distill
110
+ extensions: []
111
+ extra_rdoc_files: []
112
+ files:
113
+ - .gitignore
114
+ - Gemfile
115
+ - Guardfile
116
+ - LICENSE
117
+ - README.md
118
+ - Rakefile
119
+ - TODO
120
+ - bin/distill
121
+ - distillery.gemspec
122
+ - lib/distillery.rb
123
+ - lib/distillery/document.rb
124
+ - lib/distillery/version.rb
125
+ - spec/acceptance_spec.rb
126
+ - spec/fixtures/agave_cookies.html
127
+ - spec/fixtures/baked_ziti.html
128
+ - spec/fixtures/beef_jerkey.html
129
+ - spec/fixtures/clams_and_linguini.html
130
+ - spec/fixtures/clouds_shining_moment.html
131
+ - spec/fixtures/game_blog.html
132
+ - spec/fixtures/ginger_cookies.html
133
+ - spec/fixtures/js_this_keyword.html
134
+ - spec/fixtures/nyt_social_media.html
135
+ - spec/fixtures/pina_collada_cupcakes.html
136
+ - spec/fixtures/vanilla_pound_cake.html
137
+ - spec/lib/distillery/document_spec.rb
138
+ - spec/lib/distillery_spec.rb
139
+ - spec/spec_helper.rb
140
+ has_rdoc: true
141
+ homepage: https://github.com/Fluxx/distillery
142
+ licenses: []
143
+ post_install_message:
144
+ rdoc_options: []
145
+ require_paths:
146
+ - lib
147
+ required_ruby_version: !ruby/object:Gem::Requirement
148
+ none: false
149
+ requirements:
150
+ - - ! '>='
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ required_rubygems_version: !ruby/object:Gem::Requirement
154
+ none: false
155
+ requirements:
156
+ - - ! '>='
157
+ - !ruby/object:Gem::Version
158
+ version: '0'
159
+ requirements: []
160
+ rubyforge_project: distillery
161
+ rubygems_version: 1.6.1
162
+ signing_key:
163
+ specification_version: 3
164
+ summary: Extract the content portion of an HTML document.
165
+ test_files:
166
+ - spec/acceptance_spec.rb
167
+ - spec/fixtures/agave_cookies.html
168
+ - spec/fixtures/baked_ziti.html
169
+ - spec/fixtures/beef_jerkey.html
170
+ - spec/fixtures/clams_and_linguini.html
171
+ - spec/fixtures/clouds_shining_moment.html
172
+ - spec/fixtures/game_blog.html
173
+ - spec/fixtures/ginger_cookies.html
174
+ - spec/fixtures/js_this_keyword.html
175
+ - spec/fixtures/nyt_social_media.html
176
+ - spec/fixtures/pina_collada_cupcakes.html
177
+ - spec/fixtures/vanilla_pound_cake.html
178
+ - spec/lib/distillery/document_spec.rb
179
+ - spec/lib/distillery_spec.rb
180
+ - spec/spec_helper.rb