RubyGems - distillery - Versions diffs - 0.1.0 → 0.1.1 - Mend

distillery 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

data/README.md +6 -4
data/Rakefile +1 -1
data/lib/distillery/document.rb +54 -24
data/lib/distillery/version.rb +1 -1
data/spec/acceptance_spec.rb +10 -0
data/spec/fixtures/.DS_Store +0 -0
data/spec/fixtures/bourbon_balls.html +950 -0
data/spec/lib/distillery/document_spec.rb +54 -30
metadata +22 -18

data/spec/lib/distillery/document_spec.rb CHANGED

@@ -74,21 +74,26 @@ module Distillery
     end
-    describe 'coerce_elements_to_paragraphs!' do
+    describe 'mark_scorable_elements!' do
-      it 'converts divs who have no children to paragraphs' do
-        doc = document_of("<div>foo</div>", :coerce_elements_to_paragraphs!)
-        doc.inner_html.should == html_of("<p>foo</p>")
+      it 'marks divs who have no children to paragraphs' do
+        doc = document_of("<div>foo</div>", :mark_scorable_elements!)
+        doc.inner_html.should == html_of('<div data-distillery="scorable">foo</div>')
       end
-      it 'converts divs who have children that are not block-level elements to paragraphs' do
-        doc = document_of("<div><span>foo</span></div>", :coerce_elements_to_paragraphs!)
-        doc.inner_html.should == html_of("<p><span>foo</span></p>")
+      it 'marks divs who have children that are not block-level elements to paragraphs' do
+        doc = document_of("<div><span>foo</span></div>", :mark_scorable_elements!)
+        doc.inner_html.should == html_of('<div data-distillery="scorable"><span>foo</span></div>')
       end
-      it 'converts divs whose have empty child divs to paragrahs' do
-        doc = document_of("<div><pre>foo</pre><div></div></div>", :coerce_elements_to_paragraphs!)
-        doc.inner_html.gsub("\n", "").should == html_of("<p><pre>foo</pre><p></p></p>")
+      it 'marks divs whose have empty child divs to paragrahs' do
+        doc = document_of("<div><pre>foo</pre><div></div></div>", :mark_scorable_elements!)
+        doc.inner_html.gsub("\n", "").should == html_of('<div data-distillery="scorable"><pre>foo</pre><div data-distillery="scorable"></div></div>')
+      end
+      it 'marks all paragraphs' do
+        doc = document_of("<p>foo</p><p></p></p>", :mark_scorable_elements!)
+        doc.inner_html.gsub("\n", "").should == html_of('<p data-distillery="scorable">foo</p><p data-distillery="scorable"></p>')
       end
     end
@@ -102,10 +107,12 @@ module Distillery
         subject.scores.should_not be_empty
       end
-      it 'only calculates scores for paragraphs' do
-        doc = document_of("<p>foo</p><div>bar</div>", :score!)
+      it 'calculates scores for divs that are empty or have no block level children' do
+        doc = document_of("<div><div><div><div>bar</div></div></div></div>", :score!)
+        doc.scores.should have_key('/html/body/div/div/div/div')
+        doc.scores.should have_key('/html/body/div/div/div')
+        doc.scores.should have_key('/html/body/div/div')
         doc.scores.should_not have_key('/html/body/div')
-        doc.scores.should have_key('/html/body/p')
       end
       it 'gives one point per comma in the text of an element' do
@@ -122,14 +129,14 @@ module Distillery
       end
       it 'adds its own points to its parent' do
-        doc = document_of("<p><div><p>foo</p></div></p>", :score!)
-        doc.scores['/html/body/div/p'].should == 2
+        doc = document_of("<div><div>foo</div></div>", :score!)
+        doc.scores['/html/body/div/div'].should == 2
         doc.scores['/html/body/div'].should == 2
       end
       it 'adds 1/2 its points to its grandparent' do
-        doc = document_of("<p><div><div><p>foo</p></div></div></p>", :score!)
-        doc.scores['/html/body/div/div/p'].should == 2
+        doc = document_of("<div><div><div>foo</div></div></div>", :score!)
+        doc.scores['/html/body/div/div/div'].should == 2
         doc.scores['/html/body/div/div'].should == 2
         doc.scores['/html/body/div'].should == 1
       end
@@ -148,65 +155,65 @@ module Distillery
       end
       it 'removes all empty elements' do
-        doc = doc_with_top_scored_html_of("<div>foo <span></span</div>", :clean_top_scoring_element!)
+        doc = doc_with_top_scored_html_of("<div>foo <span></span</div>", :clean_top_scoring_elements!)
         doc.search('span').should be_empty
       end
       it 'does not remove <br> elements' do
-        doc = doc_with_top_scored_html_of("<div>foo<br class='noremove' /></div>", :clean_top_scoring_element!)
+        doc = doc_with_top_scored_html_of("<div>foo,foo,foo<br class='noremove' /></div>", :clean_top_scoring_elements!)
         doc.search('.noremove').should_not be_empty
       end
       %w[iframe form object].each do |tag|
         it "removes any #{tag} elements" do
-          doc = doc_with_top_scored_html_of("foo <#{tag}></#{tag}>", :clean_top_scoring_element!)
+          doc = doc_with_top_scored_html_of("foo <#{tag}></#{tag}>", :clean_top_scoring_elements!)
           doc.search(tag).should be_empty
         end
       end
       it 'removes elements that have negative scores' do
-        doc = doc_with_top_scored_html_of("<div class='widget'><div>bar</div></div>", :clean_top_scoring_element!)
+        doc = doc_with_top_scored_html_of("<div class='widget'><div>bar</div></div>", :clean_top_scoring_elements!)
         doc.search('.widget').should be_empty
       end
       it 'removes elements that have more images than p tags' do
-        doc = doc_with_top_scored_html_of("<div class='remove'><img><img><img><p>bar</p><div>foo</div></div>", :clean_top_scoring_element!)
+        doc = doc_with_top_scored_html_of("<div class='remove'><img><img><img><p>bar</p><div>foo</div></div>", :clean_top_scoring_elements!)
         doc.search('.remove').should be_empty
       end
       it 'removes elements that have way more li elements and it is not a list' do
-        doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :clean_top_scoring_element!)
+        doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :clean_top_scoring_elements!)
         doc.search('.remove').should be_empty
       end
       it 'removes elements that have more inputs than 1/3 the amount of p tags' do
-        doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :clean_top_scoring_element!)
+        doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :clean_top_scoring_elements!)
         doc.search('.remove').should be_empty
-        doc = doc_with_top_scored_html_of("<div class='remove'><input><p>#{'f'*25}</p><p>f</p><p>f</p></div>", :clean_top_scoring_element!)
+        doc = doc_with_top_scored_html_of("<div class='remove'><input><p>#{'f'*25}</p><p>f</p><p>f</p></div>", :clean_top_scoring_elements!)
         doc.search('.remove').should_not be_empty
       end
       it 'removes elements that have < 25 characters and (no images or > 2 images' do
-        doc = doc_with_top_scored_html_of("<div class='remove'><div>foo</div></div>", :clean_top_scoring_element!)
+        doc = doc_with_top_scored_html_of("<div class='remove'><div>foo</div></div>", :clean_top_scoring_elements!)
         doc.search('.remove').should be_empty
-        doc = doc_with_top_scored_html_of("<div class='remove'><div>foo <img><img><img></div></div>", :clean_top_scoring_element!)
+        doc = doc_with_top_scored_html_of("<div class='remove'><div>foo <img><img><img></div></div>", :clean_top_scoring_elements!)
         doc.search('.remove').should be_empty
       end
       it 'removes elements that have a weight of < 25 and link density > 0.2' do
-        doc = doc_with_top_scored_html_of("<div class='remove'><div>fffff<a>#{'b'*2}</a></div></div>", :clean_top_scoring_element!)
+        doc = doc_with_top_scored_html_of("<div class='remove'><div>fffff<a>#{'b'*2}</a></div></div>", :clean_top_scoring_elements!)
         doc.search('.remove').should be_empty
       end
       it 'removes elements that have a weight of >= 25 and link density > 0.5' do
-        doc = doc_with_top_scored_html_of("<div class='remove article'><div>#{'f'*100}<a>#{'b'*150}</a></div></div>", :clean_top_scoring_element!)
+        doc = doc_with_top_scored_html_of("<div class='remove article'><div>#{'f'*100}<a>#{'b'*150}</a></div></div>", :clean_top_scoring_elements!)
         doc.search('.remove').should be_empty
       end
       it 'should not clean the conntent elements not of table ul or div' do
-        doc = doc_with_top_scored_html_of("<span class='remove'><strong>Source:</strong> Wikipedia</span>", :clean_top_scoring_element!)
+        doc = doc_with_top_scored_html_of("<span class='remove'><strong>Source:</strong> Wikipedia</span>", :clean_top_scoring_elements!)
         doc.search('.remove').should_not be_empty
       end
@@ -253,6 +260,23 @@ module Distillery
         document_of('foo').distill!.should == 'foo'
       end
+      it 'does not return any elements with a data-distillery attribute' do
+        html = document_of('<div><p>Hello</p></div>')
+        document_of(html).distill!.should_not =~ /data-distillery/
+      end
+      it 'picks the outtermost element in the event of a tie' do
+        doc = document_of("<div><div class='included'>#{'f,'*10}</div></div>")
+        doc.distill!.should =~ /included/
+        doc.scores['/html/body/div/div'].should == 11
+        doc.scores['/html/body/div'].should == 11
+      end
+      it 'returns sibling elements to the top scoring one that have > 25% of the top scoring element\'s score' do
+        doc = document_of("<div><div>#{'f,'*10}</div></div><div><div class='me_too'>#{'f,'*5}</div></div>")
+        doc.distill!.should =~ /me_too/
+      end
     end
   end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: distillery
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
   prerelease:
 platform: ruby
 authors:
@@ -9,12 +9,12 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-04-30 00:00:00.000000000 -07:00
+date: 2011-05-08 00:00:00.000000000 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &2161079840 !ruby/object:Gem::Requirement
+  requirement: &2168823320 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>'
@@ -22,10 +22,10 @@ dependencies:
         version: '1.0'
   type: :runtime
   prerelease: false
-  version_requirements: *2161079840
+  version_requirements: *2168823320
 - !ruby/object:Gem::Dependency
   name: slop
-  requirement: &2161079300 !ruby/object:Gem::Requirement
+  requirement: &2168822640 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>'
@@ -33,10 +33,10 @@ dependencies:
         version: '1.0'
   type: :runtime
   prerelease: false
-  version_requirements: *2161079300
+  version_requirements: *2168822640
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &2161078840 !ruby/object:Gem::Requirement
+  requirement: &2168821740 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>'
@@ -44,10 +44,10 @@ dependencies:
         version: '2.0'
   type: :development
   prerelease: false
-  version_requirements: *2161078840
+  version_requirements: *2168821740
 - !ruby/object:Gem::Dependency
   name: guard
-  requirement: &2161078340 !ruby/object:Gem::Requirement
+  requirement: &2168821200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -55,10 +55,10 @@ dependencies:
         version: '0'
   type: :development
   prerelease: false
-  version_requirements: *2161078340
+  version_requirements: *2168821200
 - !ruby/object:Gem::Dependency
   name: guard-rspec
-  requirement: &2161077760 !ruby/object:Gem::Requirement
+  requirement: &2168820180 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -66,10 +66,10 @@ dependencies:
         version: '0'
   type: :development
   prerelease: false
-  version_requirements: *2161077760
+  version_requirements: *2168820180
 - !ruby/object:Gem::Dependency
   name: ruby-debug19
-  requirement: &2161077220 !ruby/object:Gem::Requirement
+  requirement: &2168811040 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -77,10 +77,10 @@ dependencies:
         version: '0'
   type: :development
   prerelease: false
-  version_requirements: *2161077220
+  version_requirements: *2168811040
 - !ruby/object:Gem::Dependency
   name: rb-fsevent
-  requirement: &2161076740 !ruby/object:Gem::Requirement
+  requirement: &2168810540 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -88,10 +88,10 @@ dependencies:
         version: '0'
   type: :development
   prerelease: false
-  version_requirements: *2161076740
+  version_requirements: *2168810540
 - !ruby/object:Gem::Dependency
   name: growl
-  requirement: &2161057060 !ruby/object:Gem::Requirement
+  requirement: &2168809680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -99,7 +99,7 @@ dependencies:
         version: '0'
   type: :development
   prerelease: false
-  version_requirements: *2161057060
+  version_requirements: *2168809680
 description: Distillery extracts the "content" portion out of an HTML document. It
   applies heuristics based on element type, location, class/id name and other attributes
   to try and find the content part of the HTML document and return it.
@@ -123,9 +123,11 @@ files:
 - lib/distillery/document.rb
 - lib/distillery/version.rb
 - spec/acceptance_spec.rb
+- spec/fixtures/.DS_Store
 - spec/fixtures/agave_cookies.html
 - spec/fixtures/baked_ziti.html
 - spec/fixtures/beef_jerkey.html
+- spec/fixtures/bourbon_balls.html
 - spec/fixtures/clams_and_linguini.html
 - spec/fixtures/clouds_shining_moment.html
 - spec/fixtures/game_blog.html
@@ -164,9 +166,11 @@ specification_version: 3
 summary: Extract the content portion of an HTML document.
 test_files:
 - spec/acceptance_spec.rb
+- spec/fixtures/.DS_Store
 - spec/fixtures/agave_cookies.html
 - spec/fixtures/baked_ziti.html
 - spec/fixtures/beef_jerkey.html
+- spec/fixtures/bourbon_balls.html
 - spec/fixtures/clams_and_linguini.html
 - spec/fixtures/clouds_shining_moment.html
 - spec/fixtures/game_blog.html