distillery 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,259 @@
1
+ require 'spec_helper'
2
+
3
+ module Distillery
4
+ describe Document do
5
+
6
+ let(:document) { File.read('./spec/fixtures/pina_collada_cupcakes.html') } # raw fixture HTML; File.read closes the handle, unlike File.open(...).read
7
+ let!(:noko_doc) { ::Nokogiri::HTML(document) } # let! evaluates eagerly, so this parse happens before any example-level stubbing of Nokogiri.HTML
8
+ subject { Document.new(document) } # the Document under test, built from the fixture markup
9
+
10
+ def document_of(html, *postprocessing)
11
+ Document.new(html_of(html)).tap do |doc|
12
+ postprocessing.each do |method|
13
+ doc.send(method)
14
+ end
15
+ end
16
+ end
17
+
18
+ def html_of(body) # wraps a body fragment in a minimal html/body document string
19
+ "<html><body>#{body}</body></html>"
20
+ end
21
+
22
+ describe ".new" do
23
+
24
+ it 'raises an exception without an argument' do
25
+ expect { Document.new }.to raise_exception(ArgumentError)
26
+ end
27
+
28
+ end
29
+
30
+ describe 'nokogiri delegation' do
31
+
32
+ before(:each) do
33
+ ::Nokogiri.stub(:HTML).and_return(noko_doc)
34
+ noko_doc.stub!(:to_xml).and_return('xml-doc')
35
+ end
36
+
37
+ it "delegates method_calls to the internal doc" do
38
+ noko_doc.should_receive(:to_xml).once
39
+ subject.to_xml.should == 'xml-doc'
40
+ end
41
+
42
+ end
43
+
44
+ describe 'remove_irrelevant_elements!' do
45
+
46
+ %w[script link meta].each do |tag|
47
+ it "should strip out ##{tag} tags" do
48
+ subject.search(tag).should_not be_empty
49
+ subject.remove_irrelevant_elements!
50
+ subject.search(tag).should be_empty
51
+ end
52
+ end
53
+
54
+ it 'does not remove the body even if it has a bad class or id' do
55
+ doc = Document.new("<html><body class='sidebar'>foo</body></html>")
56
+ doc.remove_unlikely_elements!
57
+ doc.search('body').should_not be_empty
58
+ end
59
+
60
+ end
61
+
62
+ describe 'remove_unlikely_elements!' do
63
+ %w[combx comment disqus foot header menu meta nav rss shoutbox sidebar sponsor].each do |klass|
64
+ it "removes any elements classed .#{klass}, as it is unlikely to be page content" do
65
+ doc = document_of("<div class='#{klass}'>foo</div>", :remove_unlikely_elements!)
66
+ doc.inner_html.should == html_of("")
67
+ end
68
+ it "removes any elements id'd ##{klass}, as it is unlikely to be page content" do
69
+ doc = document_of("<div id='#{klass}'>foo</div>", :remove_unlikely_elements!)
70
+ doc.inner_html.should == html_of("")
71
+ end
72
+
73
+ end
74
+
75
+ end
76
+
77
+ describe 'coerce_elements_to_paragraphs!' do
78
+
79
+ it 'converts divs who have no children to paragraphs' do
80
+ doc = document_of("<div>foo</div>", :coerce_elements_to_paragraphs!)
81
+ doc.inner_html.should == html_of("<p>foo</p>")
82
+ end
83
+
84
+ it 'converts divs who have children that are not block-level elements to paragraphs' do
85
+ doc = document_of("<div><span>foo</span></div>", :coerce_elements_to_paragraphs!)
86
+ doc.inner_html.should == html_of("<p><span>foo</span></p>")
87
+ end
88
+
89
+ it 'converts divs whose have empty child divs to paragrahs' do
90
+ doc = document_of("<div><pre>foo</pre><div></div></div>", :coerce_elements_to_paragraphs!)
91
+ doc.inner_html.gsub("\n", "").should == html_of("<p><pre>foo</pre><p></p></p>")
92
+ end
93
+
94
+ end
95
+
96
+ describe '#score!' do
97
+
98
+ it 'popualtes the score ivar with data' do
99
+ subject.scores.should be_a(Hash)
100
+ subject.scores.should be_empty
101
+ subject.score!
102
+ subject.scores.should_not be_empty
103
+ end
104
+
105
+ it 'only calculates scores for paragraphs' do
106
+ doc = document_of("<p>foo</p><div>bar</div>", :score!)
107
+ doc.scores.should_not have_key('/html/body/div')
108
+ doc.scores.should have_key('/html/body/p')
109
+ end
110
+
111
+ it 'gives one point per comma in the text of an element' do
112
+ doc = document_of("<p>foo,bar,baz</p>", :score!)
113
+ doc.scores['/html/body/p'].should == 4
114
+ end
115
+
116
+ it 'gives one point per chunk of 100 characters, max of 3' do
117
+ doc = document_of("<p>#{'f'*201}</p>", :score!)
118
+ doc.scores['/html/body/p'].should == 4
119
+
120
+ doc = document_of("<p>#{'f'*1000}</p>", :score!)
121
+ doc.scores['/html/body/p'].should == 5
122
+ end
123
+
124
+ it 'adds its own points to its parent' do
125
+ doc = document_of("<p><div><p>foo</p></div></p>", :score!)
126
+ doc.scores['/html/body/div/p'].should == 2
127
+ doc.scores['/html/body/div'].should == 2
128
+ end
129
+
130
+ it 'adds 1/2 its points to its grandparent' do
131
+ doc = document_of("<p><div><div><p>foo</p></div></div></p>", :score!)
132
+ doc.scores['/html/body/div/div/p'].should == 2
133
+ doc.scores['/html/body/div/div'].should == 2
134
+ doc.scores['/html/body/div'].should == 1
135
+ end
136
+
137
+ it 'scales the final score by the inverse link density' do
138
+ doc = document_of("<p>foobar<a>baz</a></p>", :score!)
139
+ doc.scores['/html/body/p'].should == 1.3333333333333335
140
+ end
141
+
142
+ end
143
+
144
+ describe 'clean_top_scoring_element!' do
145
+ def doc_with_top_scored_html_of(markup, *postprocessing)
146
+ markup = '<div class="winner">' + ('<p>foo,</p>'*5) + markup + '</div>'
147
+ document_of(markup, *[:prep_for_distillation!, :score!].push(*postprocessing))
148
+ end
149
+
150
+ it 'removes all empty elements' do
151
+ doc = doc_with_top_scored_html_of("<div>foo <span></span</div>", :clean_top_scoring_element!)
152
+ doc.search('span').should be_empty
153
+ end
154
+
155
+ it 'does not remove <br> elements' do
156
+ doc = doc_with_top_scored_html_of("<div>foo<br class='noremove' /></div>", :clean_top_scoring_element!)
157
+ doc.search('.noremove').should_not be_empty
158
+ end
159
+
160
+ %w[iframe form object].each do |tag|
161
+ it "removes any #{tag} elements" do
162
+ doc = doc_with_top_scored_html_of("foo <#{tag}></#{tag}>", :clean_top_scoring_element!)
163
+ doc.search(tag).should be_empty
164
+ end
165
+ end
166
+
167
+ it 'removes elements that have negative scores' do
168
+ doc = doc_with_top_scored_html_of("<div class='widget'><div>bar</div></div>", :clean_top_scoring_element!)
169
+ doc.search('.widget').should be_empty
170
+ end
171
+
172
+ it 'removes elements that have more images than p tags' do
173
+ doc = doc_with_top_scored_html_of("<div class='remove'><img><img><img><p>bar</p><div>foo</div></div>", :clean_top_scoring_element!)
174
+ doc.search('.remove').should be_empty
175
+ end
176
+
177
+ it 'removes elements that have way more li elements and it is not a list' do
178
+ doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :clean_top_scoring_element!)
179
+ doc.search('.remove').should be_empty
180
+ end
181
+
182
+ it 'removes elements that have more inputs than 1/3 the amount of p tags' do
183
+ doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :clean_top_scoring_element!)
184
+ doc.search('.remove').should be_empty
185
+
186
+ doc = doc_with_top_scored_html_of("<div class='remove'><input><p>#{'f'*25}</p><p>f</p><p>f</p></div>", :clean_top_scoring_element!)
187
+ doc.search('.remove').should_not be_empty
188
+ end
189
+
190
+ it 'removes elements that have < 25 characters and (no images or > 2 images' do
191
+ doc = doc_with_top_scored_html_of("<div class='remove'><div>foo</div></div>", :clean_top_scoring_element!)
192
+ doc.search('.remove').should be_empty
193
+
194
+ doc = doc_with_top_scored_html_of("<div class='remove'><div>foo <img><img><img></div></div>", :clean_top_scoring_element!)
195
+ doc.search('.remove').should be_empty
196
+ end
197
+
198
+ it 'removes elements that have a weight of < 25 and link density > 0.2' do
199
+ doc = doc_with_top_scored_html_of("<div class='remove'><div>fffff<a>#{'b'*2}</a></div></div>", :clean_top_scoring_element!)
200
+ doc.search('.remove').should be_empty
201
+ end
202
+
203
+ it 'removes elements that have a weight of >= 25 and link density > 0.5' do
204
+ doc = doc_with_top_scored_html_of("<div class='remove article'><div>#{'f'*100}<a>#{'b'*150}</a></div></div>", :clean_top_scoring_element!)
205
+ doc.search('.remove').should be_empty
206
+ end
207
+
208
+ it 'should not clean the conntent elements not of table ul or div' do
209
+ doc = doc_with_top_scored_html_of("<span class='remove'><strong>Source:</strong> Wikipedia</span>", :clean_top_scoring_element!)
210
+ doc.search('.remove').should_not be_empty
211
+ end
212
+
213
+ end
214
+
215
+ describe '#distill!' do
216
+ it 'returns the page content' do
217
+ subject.distill!.should =~ /great for lazy bakers/
218
+ end
219
+
220
+ it 'returns markup without the header' do
221
+ subject.distill!.should_not =~ /skinnytasteheader_1000_3/
222
+ end
223
+
224
+ it 'returns markup withouth the footer' do
225
+ subject.distill!.should_not =~ /Design by Call Me Kristin/
226
+ end
227
+
228
+ it 'returns markup without navigation' do
229
+ subject.distill!.should_not =~ /STNavbar1/
230
+ end
231
+
232
+ it 'returns markup without comments' do
233
+ subject.distill!.should_not =~ /Cindy said.../
234
+ end
235
+
236
+ if RUBY_VERSION =~ /^1.9/
237
+ it 'keeps the encoding of the string was passed in to the constructor' do
238
+ string = "<html><body><p>foo</p></body></html>"
239
+ string.encode!('ISO-8859-1')
240
+ Document.new(string).distill!.encoding.name.should == 'ISO-8859-1'
241
+ end
242
+ end
243
+
244
+ it 'does not clean the page if :clean => false is passed' do
245
+ doc = Document.new(File.open('./spec/fixtures/baked_ziti.html').read)
246
+ doc.distill!(:clean => false).should =~ /Add to Recipe Box/
247
+
248
+ doc = Document.new(File.open('./spec/fixtures/baked_ziti.html').read)
249
+ doc.distill!.should_not =~ /Add to Recipe Box/
250
+ end
251
+
252
+ it 'works with a HTML document that has no winner' do
253
+ document_of('foo').distill!.should == 'foo'
254
+ end
255
+
256
+ end
257
+
258
+ end
259
+ end
@@ -0,0 +1,27 @@
1
+ require 'spec_helper'
2
+
3
+ describe Distillery do
4
+
5
+ describe '.distill' do
6
+
7
+ let(:document) { File.open('./spec/fixtures/pina_collada_cupcakes.html').read }
8
+ let(:mockdoc) { mock(:doc, :distill => 'test') }
9
+
10
+ it 'takes a string and returns the distilled markup' do
11
+ Distillery.distill(document).should be_a(String)
12
+ end
13
+
14
+ it 'defers to Distillery::Document' do
15
+ Distillery::Document.should_receive(:new).once.with(document).and_return(mockdoc)
16
+ mockdoc.should_receive(:distill!).once
17
+ Distillery.distill(document)
18
+ end
19
+
20
+ it 'passes the same options through to the distill! method' do
21
+ Distillery::Document.stub!(:new).and_return(mockdoc)
22
+ mockdoc.should_receive(:distill!).once.with(hash_including(:clean => false))
23
+ Distillery.distill(document, :clean => false)
24
+ end
25
+ end
26
+
27
+ end
@@ -0,0 +1,13 @@
1
+ require 'distillery'
2
+ require 'rspec'
3
+ require 'ruby-debug'
4
+
5
+ Dir['./spec/support/**/*.rb'].each { |f| require f }
6
+
7
+ RSpec.configure do |config|
8
+ config.color_enabled = true
9
+ config.debug = true
10
+
11
+ config.filter_run :focus => true
12
+ config.run_all_when_everything_filtered = true
13
+ end
metadata ADDED
@@ -0,0 +1,180 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: distillery
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jeff Pollard
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-04-30 00:00:00.000000000 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: nokogiri
17
+ requirement: &2161079840 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>'
21
+ - !ruby/object:Gem::Version
22
+ version: '1.0'
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: *2161079840
26
+ - !ruby/object:Gem::Dependency
27
+ name: slop
28
+ requirement: &2161079300 !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>'
32
+ - !ruby/object:Gem::Version
33
+ version: '1.0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: *2161079300
37
+ - !ruby/object:Gem::Dependency
38
+ name: rspec
39
+ requirement: &2161078840 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ! '>'
43
+ - !ruby/object:Gem::Version
44
+ version: '2.0'
45
+ type: :development
46
+ prerelease: false
47
+ version_requirements: *2161078840
48
+ - !ruby/object:Gem::Dependency
49
+ name: guard
50
+ requirement: &2161078340 !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ type: :development
57
+ prerelease: false
58
+ version_requirements: *2161078340
59
+ - !ruby/object:Gem::Dependency
60
+ name: guard-rspec
61
+ requirement: &2161077760 !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ! '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ type: :development
68
+ prerelease: false
69
+ version_requirements: *2161077760
70
+ - !ruby/object:Gem::Dependency
71
+ name: ruby-debug19
72
+ requirement: &2161077220 !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ type: :development
79
+ prerelease: false
80
+ version_requirements: *2161077220
81
+ - !ruby/object:Gem::Dependency
82
+ name: rb-fsevent
83
+ requirement: &2161076740 !ruby/object:Gem::Requirement
84
+ none: false
85
+ requirements:
86
+ - - ! '>='
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ type: :development
90
+ prerelease: false
91
+ version_requirements: *2161076740
92
+ - !ruby/object:Gem::Dependency
93
+ name: growl
94
+ requirement: &2161057060 !ruby/object:Gem::Requirement
95
+ none: false
96
+ requirements:
97
+ - - ! '>='
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ type: :development
101
+ prerelease: false
102
+ version_requirements: *2161057060
103
+ description: Distillery extracts the "content" portion out of an HTML document. It
104
+ applies heuristics based on element type, location, class/id name and other attributes
105
+ to try and find the content part of the HTML document and return it.
106
+ email:
107
+ - jeff.pollard@gmail.com
108
+ executables:
109
+ - distill
110
+ extensions: []
111
+ extra_rdoc_files: []
112
+ files:
113
+ - .gitignore
114
+ - Gemfile
115
+ - Guardfile
116
+ - LICENSE
117
+ - README.md
118
+ - Rakefile
119
+ - TODO
120
+ - bin/distill
121
+ - distillery.gemspec
122
+ - lib/distillery.rb
123
+ - lib/distillery/document.rb
124
+ - lib/distillery/version.rb
125
+ - spec/acceptance_spec.rb
126
+ - spec/fixtures/agave_cookies.html
127
+ - spec/fixtures/baked_ziti.html
128
+ - spec/fixtures/beef_jerkey.html
129
+ - spec/fixtures/clams_and_linguini.html
130
+ - spec/fixtures/clouds_shining_moment.html
131
+ - spec/fixtures/game_blog.html
132
+ - spec/fixtures/ginger_cookies.html
133
+ - spec/fixtures/js_this_keyword.html
134
+ - spec/fixtures/nyt_social_media.html
135
+ - spec/fixtures/pina_collada_cupcakes.html
136
+ - spec/fixtures/vanilla_pound_cake.html
137
+ - spec/lib/distillery/document_spec.rb
138
+ - spec/lib/distillery_spec.rb
139
+ - spec/spec_helper.rb
140
+ has_rdoc: true
141
+ homepage: https://github.com/Fluxx/distillery
142
+ licenses: []
143
+ post_install_message:
144
+ rdoc_options: []
145
+ require_paths:
146
+ - lib
147
+ required_ruby_version: !ruby/object:Gem::Requirement
148
+ none: false
149
+ requirements:
150
+ - - ! '>='
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ required_rubygems_version: !ruby/object:Gem::Requirement
154
+ none: false
155
+ requirements:
156
+ - - ! '>='
157
+ - !ruby/object:Gem::Version
158
+ version: '0'
159
+ requirements: []
160
+ rubyforge_project: distillery
161
+ rubygems_version: 1.6.1
162
+ signing_key:
163
+ specification_version: 3
164
+ summary: Extract the content portion of an HTML document.
165
+ test_files:
166
+ - spec/acceptance_spec.rb
167
+ - spec/fixtures/agave_cookies.html
168
+ - spec/fixtures/baked_ziti.html
169
+ - spec/fixtures/beef_jerkey.html
170
+ - spec/fixtures/clams_and_linguini.html
171
+ - spec/fixtures/clouds_shining_moment.html
172
+ - spec/fixtures/game_blog.html
173
+ - spec/fixtures/ginger_cookies.html
174
+ - spec/fixtures/js_this_keyword.html
175
+ - spec/fixtures/nyt_social_media.html
176
+ - spec/fixtures/pina_collada_cupcakes.html
177
+ - spec/fixtures/vanilla_pound_cake.html
178
+ - spec/lib/distillery/document_spec.rb
179
+ - spec/lib/distillery_spec.rb
180
+ - spec/spec_helper.rb