RubyGems - distillery - Versions diffs - 0.1.0 - Mend

distillery 0.1.0

Files changed (28) hide show

data/.gitignore +4 -0
data/Gemfile +2 -0
data/Guardfile +5 -0
data/LICENSE +20 -0
data/README.md +41 -0
data/Rakefile +40 -0
data/TODO +5 -0
data/bin/distill +24 -0
data/distillery.gemspec +31 -0
data/lib/distillery.rb +15 -0
data/lib/distillery/document.rb +181 -0
data/lib/distillery/version.rb +3 -0
data/spec/acceptance_spec.rb +108 -0
data/spec/fixtures/agave_cookies.html +467 -0
data/spec/fixtures/baked_ziti.html +2250 -0
data/spec/fixtures/beef_jerkey.html +457 -0
data/spec/fixtures/clams_and_linguini.html +1009 -0
data/spec/fixtures/clouds_shining_moment.html +2145 -0
data/spec/fixtures/game_blog.html +158 -0
data/spec/fixtures/ginger_cookies.html +181 -0
data/spec/fixtures/js_this_keyword.html +1183 -0
data/spec/fixtures/nyt_social_media.html +418 -0
data/spec/fixtures/pina_collada_cupcakes.html +4481 -0
data/spec/fixtures/vanilla_pound_cake.html +2190 -0
data/spec/lib/distillery/document_spec.rb +259 -0
data/spec/lib/distillery_spec.rb +27 -0
data/spec/spec_helper.rb +13 -0
metadata +180 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,4 @@
+*.gem
+.bundle
+Gemfile.lock
+pkg/*

data/Gemfile ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ source "https://rubygems.org" # fetch gems over TLS rather than plain HTTP
2	+ gemspec

data/Guardfile ADDED Viewed

@@ -0,0 +1,5 @@
+guard 'rspec' do
+  watch(%r{^spec/.+_spec\.rb})
+  watch(%r{^lib/(.+)\.rb})     { |m| "spec/lib/#{m[1]}_spec.rb" }
+  watch('spec/spec_helper.rb') { "spec" }
+end

data/LICENSE ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2011 Jeff Pollard
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,41 @@
+# Distillery
+Distillery extracts the "content" portion out of an HTML document.  It applies heuristics based on element type, location, class/id name and other attributes to try and find the content part of the HTML document and return it.
+The logic for Distillery was heavily influenced by [Readability](https://www.readability.com/), who was nice enough to make [their logic](http://code.google.com/p/arc90labs-readability/source/browse/trunk/js/readability.js) open source.  Distillery does *not* aim to be a direct port of that logic.  See [iterationlabs/ruby-readability](https://github.com/iterationlabs/ruby-readability) for something closer to that.
+Readability and Distillery share nearly the same logic for locating the content HTML element on the page.  Readability, however, also aggressively cleans and transforms the content element HTML to be used for display in a reading environment.  Distillery aims to clean slightly less aggressively, and allow the user of the gem to choose how (and if) they would like to clean content element HTML.
+## Installation
+    gem install distillery
+## Usage
+Usage is quite simple:
+    Distillery.distill(html_doc_as_a_string)
+    > "distilled content"
+If you would like a more object-oriented syntax, Distillery offers a `Distillery::Document` API.  Like the `distill` method above, its constructor takes a string that is the content of the HTML page you would like to distill:
+    doc = Distillery::Document.new(string_of_html)
+Then you simply call `#distill!` on the document object to distill it and return the distilled content.
+    doc.distill!
+    > "distilled content"
+Both the `Distillery::Document#distill!` and `Distillery.distill` methods by default will clean the HTML of the content to remove elements from it which are unlikely to be the actual content.  Usually, these are things like social media share buttons, widgets, advertisements, etc.  If you would like to not clean the content, simply pass `:dirty => true` to either method:
+    doc.distill!(:dirty => true)
+    > "raw distilled content"
+## From the command line
+Distillery also ships with an executable that allows you to distill documents at the command line:
+    Usage: distill [options] http://www.example.com/
+        -d, --dirty        Do not clean content HTML
+        -v, --version      Print the version
+        -h, --help         Print this help message

data/Rakefile ADDED Viewed

@@ -0,0 +1,40 @@
+require 'bundler'
+Bundler::GemHelper.install_tasks
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec) do |t|
+  t.rspec_opts = %w[--profile]
+  t.pattern = 'spec/**/*_spec.rb'
+end
+require "distillery"
+def doc_for_fixture(fixture)
+  file = File.join(File.dirname(__FILE__), 'spec', 'fixtures', fixture)
+  Distillery::Document.new(File.open(file).read)
+end
+namespace :fixture do
+  desc 'Open the fixture with data-score elements added showing an elements score'
+  task :score, :filename do |t, args|
+    doc = doc_for_fixture(args[:filename])
+    doc.prep_for_distillation
+    doc.scores.each do |xpath, score|
+      doc.at(xpath)['data-score'] = score.to_s
+    end
+    outfile = File.open("/tmp/scored.#{args[:filename]}", 'w')
+    outfile << doc.to_s
+    sh "open #{outfile.path}"
+  end
+  desc 'Distill a fixture and open it'
+  task :distill, :filename do |t, args|
+    outfile = File.open("/tmp/distilled.#{args[:filename]}", 'w')
+    outfile << doc_for_fixture(args[:filename]).distill!
+    sh "open #{outfile.path}"
+  end
+end
+task :default => :spec

data/TODO ADDED Viewed

@@ -0,0 +1,5 @@
+- Give users the possibility of preserving the HTML of the content element as it was seen.
+- Instead of a string, return a Node from Nokogiri
+- Remove HTML comments from output
+- Convert newline breaks to paragraphs
+- Convert text nodes to <p> as well

data/bin/distill ADDED Viewed

@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+$LOAD_PATH << File.dirname(__FILE__) + '/../lib/'
+require 'open-uri'
+require 'distillery'
+require 'slop'
+opts = Slop.parse :help => true do
+  on :d, :dirty, 'Do not clean content HTML', default: false
+  on :v, :version, 'Print the version' do
+    puts Distillery::VERSION
+    exit
+  end
+  banner "Usage: distill [options] http://www.example.com/"
+end
+unless ARGV.last =~ /^http/
+  puts opts.help
+else
+  puts Distillery.distill(open(ARGV.last).read, :clean => !opts.dirty?)
+end

data/distillery.gemspec ADDED Viewed

@@ -0,0 +1,31 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "distillery/version"
+Gem::Specification.new do |s|
+  s.name        = "distillery"
+  s.version     = Distillery::VERSION
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ["Jeff Pollard"]
+  s.email       = ["jeff.pollard@gmail.com"]
+  s.homepage    = "https://github.com/Fluxx/distillery"
+  s.summary     = %q{Extract the content portion of an HTML document.}
+  s.description = %q{Distillery extracts the "content" portion out of an HTML document. It applies heuristics based on element type, location, class/id name and other attributes to try and find the content part of the HTML document and return it.}
+  s.rubyforge_project = "distillery"
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+  s.add_dependency('nokogiri', '> 1.0')
+  s.add_dependency('slop', '> 1.0')
+  s.add_development_dependency('rspec', '> 2.0')
+  s.add_development_dependency('guard')
+  s.add_development_dependency('guard-rspec')
+  s.add_development_dependency('ruby-debug19')
+  s.add_development_dependency('rb-fsevent')
+  s.add_development_dependency('growl')
+end

data/lib/distillery.rb ADDED Viewed

@@ -0,0 +1,15 @@
+require "distillery/document"
+require "distillery/version"
+module Distillery
+  ROOT = File.dirname(__FILE__)
+  # Distills the HTMl document string to just the conent portion.
+  #
+  # @param [String] str The HTML document to distill as a string.
+  # @param [Hash] options Distillation options
+  # @option options [Symbol] :dirty Do not clean the content element HTML
+  def self.distill(str, options = {})
+    Document.new(str).distill!(options)
+  end
+end

data/lib/distillery/document.rb ADDED Viewed

@@ -0,0 +1,181 @@
+require "delegate"
+require "nokogiri"
+module Distillery
+  # Wraps a Nokogiri document for the HTML page to be disilled and holds all methods to
+  # clean and distill the document down to just its content element.
+  class Document < SimpleDelegator
+    # HTML elements unlikely to contain the content element.
+    UNLIKELY_TAGS = %w[head script link meta]
+    # HTML ids and classes that are unlikely to contain the content element.
+    UNLIKELY_IDENTIFIERS = /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i
+    # "Block" elements who signal its parent is less-likely to be the content element.
+    BLOCK_ELEMENTS = %w[a blockquote dl div img ol p pre table ul]
+    # HTML ids and classes that are positive signals of the content element.
+    POSITIVE_IDENTIFIERS = /article|body|content|entry|hentry|page|pagination|post|text/i
+    # HTML ids and classes that are negative signals of the content element.
+    NEGATIVE_IDENTIFIERS = /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget/i
+    # HTML elements that are unrelated to the content in the content element.
+    UNRELATED_ELEMENTS = %w[iframe form object]
+    # HTML elements that are possible unrelated to the content of the content HTML
+    # element.
+    POSSIBLE_UNRELATED_ELEMENTS = %w[table ul div]
+    # The Nokogiri document
+    attr_reader :doc
+    # Hash of xpath => content score of elements in this document
+    attr_reader :scores
+    # Create a new Document
+    #
+    # @param [String] str The HTML document to distill as a string.
+    def initialize(page_string)
+      @scores = Hash.new(0)
+      super(::Nokogiri::HTML(page_string))
+    end
+    # Removes irrelevent elements from the document.  This is usually things like <script>,
+    # <link> and other page elements we don't care about
+    def remove_irrelevant_elements!(tags = UNLIKELY_TAGS)
+      search(*tags).each(&:remove)
+    end
+    # Removes unlikely elements from the document.  These are elements who have classes
+    # that seem to indicate they are comments, headers, footers, nav, etc
+    def remove_unlikely_elements!
+      search('*').each do |element|
+        idclass = "#{element['class']}#{element['id']}"
+        element.remove if idclass =~ UNLIKELY_IDENTIFIERS && element.name != 'body'
+      end
+    end
+    # Corrects improper use of HTML tags by coerceing elements that are likely paragraphs
+    # to <p> tags
+    def coerce_elements_to_paragraphs!
+      search('div').each do |div|
+        div.name = "p" if has_no_block_children?(div) || has_only_empty_div_children?(div)
+      end
+    end
+    # Scores the document elements based on an algorithm to find elements which hold page
+    # content.
+    def score!
+      search('p').each do |paragraph|
+        points = 1
+        points += paragraph.text.split(',').length
+        points += [paragraph.text.length / 100, 3].min
+        scores[paragraph.path] = points
+        parent = paragraph.parent
+        scores[parent.path] += points
+        scores[parent.parent.path] += points.to_f/2
+      end
+      augment_scores_by_link_weight!
+    end
+    # Distills the document down to just its content.
+    #
+    # @param [Hash] options Distillation options
+    # @option options [Symbol] :dirty Do not clean the content element HTML
+    def distill!(options = {})
+      prep_for_distillation!
+      score!
+      clean_top_scoring_element! unless options.delete(:clean) == false
+      top_scoring_element.inner_html
+    end
+    # Attempts to clean the top scoring node from non-page content items, such as
+    # advertisements, widgets, etc
+    def clean_top_scoring_element!
+      top_scoring_element.search("*").each do |node|
+        node.remove if has_empty_text?(node)
+      end
+      top_scoring_element.search("*").each do |node|
+        if UNRELATED_ELEMENTS.include?(node.name) ||
+          (node.text.count(',') < 2 && unlikely_to_be_content?(node))
+          node.remove
+        end
+      end
+    end
+    # Prepares the document for distillation by removing irrelevant and unlikely elements,
+    # as well as corecomg some elements to paragraphs for scoring.
+    def prep_for_distillation!
+      remove_irrelevant_elements!
+      remove_unlikely_elements!
+      coerce_elements_to_paragraphs!
+    end
+    private
+    def augment_scores_by_link_weight!
+      scores.each do |xpath, points|
+        scores[xpath] = scores[xpath] * ( 1 - link_density(at(xpath)) )
+      end
+    end
+    def link_density(elem)
+      link_length = elem.search('a').reduce(0) { |total, e| total + e.text.length }
+      total_length = [elem.text.length, 1].max # Protect against dividing by 0
+      link_length.to_f / total_length.to_f
+    end
+    def top_scoring_element
+      winner = scores.sort_by { |xpath, score| score }.reverse.first
+      top_xpath, top_score = winner || ['/html/body', 1]
+      at(top_xpath)
+    end
+    def has_no_block_children?(elem)
+      elem.children.none? { |c| BLOCK_ELEMENTS.include?(c.name) }
+    end
+    def has_only_empty_div_children?(elem)
+      elem.search('div').all? { |subdiv| subdiv.text == "" }
+    end
+    def identifier_weight(elem)
+      {POSITIVE_IDENTIFIERS => 25, NEGATIVE_IDENTIFIERS => -25}.reduce(0) do |weight, pair|
+        regex, score = pair
+        (weight += score if "#{elem['class']}+#{elem['id']}" =~ regex) or weight
+      end
+    end
+    def has_empty_text?(elem)
+      elem.text.gsub(/\s/, '').empty? && elem.name != 'br'
+    end
+    def unlikely_to_be_content?(elem)
+      return false unless POSSIBLE_UNRELATED_ELEMENTS.include?(elem.name)
+      p = elem.search('p').length
+      img = elem.search('img').length
+      li = elem.search('li').length
+      input = elem.search('input').length
+      weight = identifier_weight(elem)
+      link_density = link_density(elem)
+      weight < 0 ||                                        # Terrible weight
+      elem.text.empty? || elem.text.length < 15 ||         # Empty text or too short text
+      img > p ||                                           # More images than paragraphs
+      li > p && !(elem.name =~ /ul|ol/) ||                 # Has lots of list items
+      input > p / 3 ||                                     # Has a high % of inputs
+      elem.text.length < 25 && (img == 0 || img > 2) ||    # Short text + no/high img count
+      weight < 25 && link_density > 0.2 ||                 # Weak content signal and moderate link density
+      weight >= 25 && link_density > 0.5                   # Strong content signal and high link density
+    end
+  end
+end

data/lib/distillery/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Distillery
+  # Version of the gem; read by distillery.gemspec and printed by `distill --version`.
+  VERSION = "0.1.0"
+end

data/spec/acceptance_spec.rb ADDED Viewed

@@ -0,0 +1,108 @@
+require 'spec_helper'
+def distillation_of(filename, &block)
+  describe "distillation of #{filename}" do
+    let(:fixture) do
+      File.read(File.join(File.dirname(__FILE__), 'fixtures', filename))
+    end
+    subject { Distillery::Document.new(fixture).distill! }
+    it 'should include the right elements' do
+      instance_eval(&block)
+    end
+  end
+end
+# Acceptance expectations, one group per fixture.  In each group the `should` lines name
+# text that must be present in the distilled output, and the `should_not` lines name
+# text from the part of the page given in the trailing comment, which must be absent.
+distillation_of 'agave_cookies.html' do
+  should =~ /AGAVE &amp; HONEY OATMEAL M&amp;M COOKIES/
+  should =~ /2 Tbsp lightly beaten egg/
+  should =~ /Recipe Source:/
+  should_not =~ /I am a HUGE fan of agave and cook/         # Post comment
+  should_not =~ /mnuEntertaining/                           # ID of element in header
+  should_not =~ /Get Email Updates/                         # Sidebar
+  should_not =~ /id="footer"/                               # Footer
+end
+distillation_of 'clams_and_linguini.html' do
+  should =~ /<h2>Linguini with Clam Sauce Recipe<\/h2>/
+  should =~ /2 pounds small clams in the shell/
+  should =~ /completely evaporated./
+  should_not =~ /Licorice sounds interesting./              # Comment
+  should_not =~ /Bookmark this page using the following/    # Footer
+  should_not =~ /Google Search/                             # Header
+end
+distillation_of 'beef_jerkey.html' do
+  should =~ /always had a weakness/
+  should =~ /2 pounds trimmed beef top round/
+  should =~ /Om nom nom nom/
+  should_not =~ /Leave a Reply/                             # Footer
+  should_not =~ /EMAIL SUBSCRIPTION/                        # Sidebar
+  should_not =~ /allthingssimpleblog.com\/feed\//           # Header
+end
+distillation_of 'vanilla_pound_cake.html' do
+  should =~ /Tahitian bean for its floral notes/
+  should =~ /beat until light and fluffy/
+  should =~ /cake comes out clean/
+  should_not =~ /Pound cake is a classi/                    # Comments
+  should_not =~ /Simple template. Powered by/               # Footer
+  should_not =~ /Conversions and Measurement Tips/          # Header
+end
+distillation_of 'clouds_shining_moment.html' do
+  should =~ /The Dueling Models of Cloud Computing/
+  should =~ /These kinds of failures don't expose the weaknesses/
+  should =~ /Dynamic DNS pointing to elastic load balancers/
+  should_not =~ /Razi Sharir/                               # Comments
+  should_not =~ /All trademarks and registered/             # Footer
+  should_not =~ /Community Guidelines/                      # Header
+end
+distillation_of 'game_blog.html' do
+  should =~ /Currently in my Plants vs Zombies clone/
+  should =~ /50% they start to show sign/
+  should =~ /can never get enough feedback./
+  should_not =~ /Tutorials/                                 # Header
+  should_not =~ /Java Project/                              # Sidebar
+  should_not =~ /View all comments/                         # Footer
+end
+distillation_of 'js_this_keyword.html' do
+  should =~ /keyword is ubiquitous yet misconceptions abound/
+  should =~ /in ECMAScript parlance these are/
+  should =~ /Annex C/
+  should_not =~ /11 RESPONSES TO UNDERSTANDING/             # Footer
+  should_not =~ /The JavaScript Comma Operator/             # Sidebar
+  should_not =~ /Auto-generating JavaScript Unit Test/      # Header
+end
+distillation_of 'nyt_social_media.html' do
+  should =~ /What happens if you bring together/
+  should =~ /shows a 2D bar-graph-like timeline/
+  should =~ /then to explore several links/
+  should_not =~ /ADD A COMMENT/                             # Comments
+  should_not =~ /ABOUT 1,000 POSTS AGO/                     # Sidebar
+  should_not =~ /iPhone Tracker: How your/                  # Header
+end
+distillation_of 'ginger_cookies.html' do
+  should =~ /Ginger cookies are chilled/
+  should =~ /12 minutes/
+  should =~ /Makes about 4 dozen crispy/
+  should_not =~ /Sponsored Links/                             # Sidebar
+  should_not =~ /User Reviews/                                # Comments
+  should_not =~ /Free Southern Food Newsletter!/              # Header
+end