RubyGems - distillery - Versions diffs - 0.1.0 - Mend

distillery 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

data/.gitignore +4 -0
data/Gemfile +2 -0
data/Guardfile +5 -0
data/LICENSE +20 -0
data/README.md +41 -0
data/Rakefile +40 -0
data/TODO +5 -0
data/bin/distill +24 -0
data/distillery.gemspec +31 -0
data/lib/distillery.rb +15 -0
data/lib/distillery/document.rb +181 -0
data/lib/distillery/version.rb +3 -0
data/spec/acceptance_spec.rb +108 -0
data/spec/fixtures/agave_cookies.html +467 -0
data/spec/fixtures/baked_ziti.html +2250 -0
data/spec/fixtures/beef_jerkey.html +457 -0
data/spec/fixtures/clams_and_linguini.html +1009 -0
data/spec/fixtures/clouds_shining_moment.html +2145 -0
data/spec/fixtures/game_blog.html +158 -0
data/spec/fixtures/ginger_cookies.html +181 -0
data/spec/fixtures/js_this_keyword.html +1183 -0
data/spec/fixtures/nyt_social_media.html +418 -0
data/spec/fixtures/pina_collada_cupcakes.html +4481 -0
data/spec/fixtures/vanilla_pound_cake.html +2190 -0
data/spec/lib/distillery/document_spec.rb +259 -0
data/spec/lib/distillery_spec.rb +27 -0
data/spec/spec_helper.rb +13 -0
metadata +180 -0

data/spec/lib/distillery/document_spec.rb ADDED Viewed

@@ -0,0 +1,259 @@
+require 'spec_helper'
+module Distillery
+  describe Document do
+    # Raw HTML of the fixture page. File.read closes the file as soon as the
+    # contents are loaded; File.open(...).read left the handle open until GC.
+    let(:document) { File.read('./spec/fixtures/pina_collada_cupcakes.html') }
+    # Eager (let!) on purpose: the 'nokogiri delegation' examples stub
+    # ::Nokogiri.HTML, so the real parse has to happen before that stub exists.
+    let!(:noko_doc) { ::Nokogiri::HTML(document) }
+    subject { Document.new(document) }
+    def document_of(html, *postprocessing)
+      Document.new(html_of(html)).tap do |doc|
+        postprocessing.each do |method|
+          doc.send(method)
+        end
+      end
+    end
+    def html_of(body)
+      "<html><body>#{body}</body></html>"
+    end
+    describe ".new" do
+      it 'raises an exception without an argument' do
+        expect { Document.new }.to raise_exception(ArgumentError)
+      end
+    end
+    describe 'nokogiri delegation' do
+      before(:each) do
+        ::Nokogiri.stub(:HTML).and_return(noko_doc)
+        noko_doc.stub!(:to_xml).and_return('xml-doc')
+      end
+      it "delegates method_calls to the internal doc" do
+        noko_doc.should_receive(:to_xml).once
+        subject.to_xml.should == 'xml-doc'
+      end
+    end
+    describe 'remove_irrelevant_elements!' do
+      %w[script link meta].each do |tag|
+        it "should strip out ##{tag} tags" do
+          subject.search(tag).should_not be_empty
+          subject.remove_irrelevant_elements!
+          subject.search(tag).should be_empty
+        end
+      end
+      it 'does not remove the body even if it has a bad class or id' do
+        doc = Document.new("<html><body class='sidebar'>foo</body></html>")
+        doc.remove_unlikely_elements!
+        doc.search('body').should_not be_empty
+      end
+    end
+    describe 'remove_unlikely_elements!' do
+      %w[combx comment disqus foot header menu meta nav rss shoutbox sidebar sponsor].each do |klass|
+        it "removes any elements classed .#{klass}, as it is unlikely to be page content" do
+          doc = document_of("<div class='#{klass}'>foo</div>", :remove_unlikely_elements!)
+          doc.inner_html.should == html_of("")
+        end
+        it "removes any elements id'd ##{klass}, as it is unlikely to be page content" do
+          doc = document_of("<div id='#{klass}'>foo</div>", :remove_unlikely_elements!)
+          doc.inner_html.should == html_of("")
+        end
+      end
+    end
+    describe 'coerce_elements_to_paragraphs!' do
+      it 'converts divs who have no children to paragraphs' do
+        doc = document_of("<div>foo</div>", :coerce_elements_to_paragraphs!)
+        doc.inner_html.should == html_of("<p>foo</p>")
+      end
+      it 'converts divs who have children that are not block-level elements to paragraphs' do
+        doc = document_of("<div><span>foo</span></div>", :coerce_elements_to_paragraphs!)
+        doc.inner_html.should == html_of("<p><span>foo</span></p>")
+      end
+      it 'converts divs whose have empty child divs to paragrahs' do
+        doc = document_of("<div><pre>foo</pre><div></div></div>", :coerce_elements_to_paragraphs!)
+        doc.inner_html.gsub("\n", "").should == html_of("<p><pre>foo</pre><p></p></p>")
+      end
+    end
+    describe '#score!' do
+      it 'popualtes the score ivar with data' do
+        subject.scores.should be_a(Hash)
+        subject.scores.should be_empty
+        subject.score!
+        subject.scores.should_not be_empty
+      end
+      it 'only calculates scores for paragraphs' do
+        doc = document_of("<p>foo</p><div>bar</div>", :score!)
+        doc.scores.should_not have_key('/html/body/div')
+        doc.scores.should have_key('/html/body/p')
+      end
+      it 'gives one point per comma in the text of an element' do
+        doc = document_of("<p>foo,bar,baz</p>", :score!)
+        doc.scores['/html/body/p'].should == 4
+      end
+      it 'gives one point per chunk of 100 characters, max of 3' do
+        doc = document_of("<p>#{'f'*201}</p>", :score!)
+        doc.scores['/html/body/p'].should == 4
+        doc = document_of("<p>#{'f'*1000}</p>", :score!)
+        doc.scores['/html/body/p'].should == 5
+      end
+      it 'adds its own points to its parent' do
+        doc = document_of("<p><div><p>foo</p></div></p>", :score!)
+        doc.scores['/html/body/div/p'].should == 2
+        doc.scores['/html/body/div'].should == 2
+      end
+      it 'adds 1/2 its points to its grandparent' do
+        doc = document_of("<p><div><div><p>foo</p></div></div></p>", :score!)
+        doc.scores['/html/body/div/div/p'].should == 2
+        doc.scores['/html/body/div/div'].should == 2
+        doc.scores['/html/body/div'].should == 1
+      end
+      it 'scales the final score by the inverse link density' do
+        doc = document_of("<p>foobar<a>baz</a></p>", :score!)
+        doc.scores['/html/body/p'].should == 1.3333333333333335
+      end
+    end
+    describe 'clean_top_scoring_element!' do
+      def doc_with_top_scored_html_of(markup, *postprocessing)
+        markup = '<div class="winner">' + ('<p>foo,</p>'*5) + markup + '</div>'
+        document_of(markup, *[:prep_for_distillation!, :score!].push(*postprocessing))
+      end
+      it 'removes all empty elements' do
+        doc = doc_with_top_scored_html_of("<div>foo <span></span</div>", :clean_top_scoring_element!)
+        doc.search('span').should be_empty
+      end
+      it 'does not remove <br> elements' do
+        doc = doc_with_top_scored_html_of("<div>foo<br class='noremove' /></div>", :clean_top_scoring_element!)
+        doc.search('.noremove').should_not be_empty
+      end
+      %w[iframe form object].each do |tag|
+        it "removes any #{tag} elements" do
+          doc = doc_with_top_scored_html_of("foo <#{tag}></#{tag}>", :clean_top_scoring_element!)
+          doc.search(tag).should be_empty
+        end
+      end
+      it 'removes elements that have negative scores' do
+        doc = doc_with_top_scored_html_of("<div class='widget'><div>bar</div></div>", :clean_top_scoring_element!)
+        doc.search('.widget').should be_empty
+      end
+      it 'removes elements that have more images than p tags' do
+        doc = doc_with_top_scored_html_of("<div class='remove'><img><img><img><p>bar</p><div>foo</div></div>", :clean_top_scoring_element!)
+        doc.search('.remove').should be_empty
+      end
+      it 'removes elements that have way more li elements and it is not a list' do
+        doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :clean_top_scoring_element!)
+        doc.search('.remove').should be_empty
+      end
+      it 'removes elements that have more inputs than 1/3 the amount of p tags' do
+        doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :clean_top_scoring_element!)
+        doc.search('.remove').should be_empty
+        doc = doc_with_top_scored_html_of("<div class='remove'><input><p>#{'f'*25}</p><p>f</p><p>f</p></div>", :clean_top_scoring_element!)
+        doc.search('.remove').should_not be_empty
+      end
+      it 'removes elements that have < 25 characters and (no images or > 2 images' do
+        doc = doc_with_top_scored_html_of("<div class='remove'><div>foo</div></div>", :clean_top_scoring_element!)
+        doc.search('.remove').should be_empty
+        doc = doc_with_top_scored_html_of("<div class='remove'><div>foo <img><img><img></div></div>", :clean_top_scoring_element!)
+        doc.search('.remove').should be_empty
+      end
+      it 'removes elements that have a weight of < 25 and link density > 0.2' do
+        doc = doc_with_top_scored_html_of("<div class='remove'><div>fffff<a>#{'b'*2}</a></div></div>", :clean_top_scoring_element!)
+        doc.search('.remove').should be_empty
+      end
+      it 'removes elements that have a weight of >= 25 and link density > 0.5' do
+        doc = doc_with_top_scored_html_of("<div class='remove article'><div>#{'f'*100}<a>#{'b'*150}</a></div></div>", :clean_top_scoring_element!)
+        doc.search('.remove').should be_empty
+      end
+      it 'should not clean the conntent elements not of table ul or div' do
+        doc = doc_with_top_scored_html_of("<span class='remove'><strong>Source:</strong> Wikipedia</span>", :clean_top_scoring_element!)
+        doc.search('.remove').should_not be_empty
+      end
+    end
+    describe '#distill!' do
+      it 'returns the page content' do
+        subject.distill!.should =~ /great for lazy bakers/
+      end
+      it 'returns markup without the header' do
+        subject.distill!.should_not =~ /skinnytasteheader_1000_3/
+      end
+      it 'returns markup withouth the footer' do
+        subject.distill!.should_not =~ /Design by Call Me Kristin/
+      end
+      it 'returns markup without navigation' do
+        subject.distill!.should_not =~ /STNavbar1/
+      end
+      it 'returns markup without comments' do
+        subject.distill!.should_not =~ /Cindy said.../
+      end
+      if RUBY_VERSION =~ /^1.9/
+        it 'keeps the encoding of the string was passed in to the constructor' do
+          string = "<html><body><p>foo</p></body></html>"
+          string.encode!('ISO-8859-1')
+          Document.new(string).distill!.encoding.name.should == 'ISO-8859-1'
+        end
+      end
+      it 'does not clean the page if :clean => false is passed' do
+        doc = Document.new(File.open('./spec/fixtures/baked_ziti.html').read)
+        doc.distill!(:clean => false).should =~ /Add to Recipe Box/
+        doc = Document.new(File.open('./spec/fixtures/baked_ziti.html').read)
+        doc.distill!.should_not =~ /Add to Recipe Box/
+      end
+      it 'works with a HTML document that has no winner' do
+        document_of('foo').distill!.should == 'foo'
+      end
+    end
+  end
+end

data/spec/lib/distillery_spec.rb ADDED Viewed

@@ -0,0 +1,27 @@
+require 'spec_helper'
+describe Distillery do
+  describe '.distill' do
+    let(:document) { File.open('./spec/fixtures/pina_collada_cupcakes.html').read }
+    let(:mockdoc) { mock(:doc, :distill => 'test') }
+    it 'takes a string and returns the distilled markup' do
+      Distillery.distill(document).should be_a(String)
+    end
+    it 'defers to Distillery::Document' do
+      Distillery::Document.should_receive(:new).once.with(document).and_return(mockdoc)
+      mockdoc.should_receive(:distill!).once
+      Distillery.distill(document)
+    end
+    it 'passes the same options through to the distill! method' do
+      Distillery::Document.stub!(:new).and_return(mockdoc)
+      mockdoc.should_receive(:distill!).once.with(hash_including(:clean => false))
+      Distillery.distill(document, :clean => false)
+    end
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,13 @@
+require 'distillery'
+require 'rspec'
+require 'ruby-debug'
+Dir['./spec/support/**/*.rb'].each { |f| require f }
+RSpec.configure do |config|
+  config.color_enabled = true
+  config.debug = true
+  config.filter_run :focus => true
+  config.run_all_when_everything_filtered = true
+end

metadata ADDED Viewed

@@ -0,0 +1,180 @@
+--- !ruby/object:Gem::Specification
+name: distillery
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+  prerelease:
+platform: ruby
+authors:
+- Jeff Pollard
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-04-30 00:00:00.000000000 -07:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: &2161079840 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>'
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: *2161079840
+- !ruby/object:Gem::Dependency
+  name: slop
+  requirement: &2161079300 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>'
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: *2161079300
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: &2161078840 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>'
+      - !ruby/object:Gem::Version
+        version: '2.0'
+  type: :development
+  prerelease: false
+  version_requirements: *2161078840
+- !ruby/object:Gem::Dependency
+  name: guard
+  requirement: &2161078340 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *2161078340
+- !ruby/object:Gem::Dependency
+  name: guard-rspec
+  requirement: &2161077760 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *2161077760
+- !ruby/object:Gem::Dependency
+  name: ruby-debug19
+  requirement: &2161077220 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *2161077220
+- !ruby/object:Gem::Dependency
+  name: rb-fsevent
+  requirement: &2161076740 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *2161076740
+- !ruby/object:Gem::Dependency
+  name: growl
+  requirement: &2161057060 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *2161057060
+description: Distillery extracts the "content" portion out of an HTML document. It
+  applies heuristics based on element type, location, class/id name and other attributes
+  to try and find the content part of the HTML document and return it.
+email:
+- jeff.pollard@gmail.com
+executables:
+- distill
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- Guardfile
+- LICENSE
+- README.md
+- Rakefile
+- TODO
+- bin/distill
+- distillery.gemspec
+- lib/distillery.rb
+- lib/distillery/document.rb
+- lib/distillery/version.rb
+- spec/acceptance_spec.rb
+- spec/fixtures/agave_cookies.html
+- spec/fixtures/baked_ziti.html
+- spec/fixtures/beef_jerkey.html
+- spec/fixtures/clams_and_linguini.html
+- spec/fixtures/clouds_shining_moment.html
+- spec/fixtures/game_blog.html
+- spec/fixtures/ginger_cookies.html
+- spec/fixtures/js_this_keyword.html
+- spec/fixtures/nyt_social_media.html
+- spec/fixtures/pina_collada_cupcakes.html
+- spec/fixtures/vanilla_pound_cake.html
+- spec/lib/distillery/document_spec.rb
+- spec/lib/distillery_spec.rb
+- spec/spec_helper.rb
+has_rdoc: true
+homepage: https://github.com/Fluxx/distillery
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project: distillery
+rubygems_version: 1.6.1
+signing_key:
+specification_version: 3
+summary: Extract the content portion of an HTML document.
+test_files:
+- spec/acceptance_spec.rb
+- spec/fixtures/agave_cookies.html
+- spec/fixtures/baked_ziti.html
+- spec/fixtures/beef_jerkey.html
+- spec/fixtures/clams_and_linguini.html
+- spec/fixtures/clouds_shining_moment.html
+- spec/fixtures/game_blog.html
+- spec/fixtures/ginger_cookies.html
+- spec/fixtures/js_this_keyword.html
+- spec/fixtures/nyt_social_media.html
+- spec/fixtures/pina_collada_cupcakes.html
+- spec/fixtures/vanilla_pound_cake.html
+- spec/lib/distillery/document_spec.rb
+- spec/lib/distillery_spec.rb
+- spec/spec_helper.rb