RubyGems - extractula - Versions diffs - 0.0.1 - Mend

extractula 0.0.1

Files changed (17) hide show

data/README.textile +0 -0
data/lib/extractula/dom_extractor.rb +68 -0
data/lib/extractula/extracted_content.rb +32 -0
data/lib/extractula.rb +24 -0
data/spec/extractula/dom_extractor_spec.rb +109 -0
data/spec/extractula/extracted_content_spec.rb +48 -0
data/spec/extractula_spec.rb +45 -0
data/spec/spec.opts +2 -0
data/spec/spec_helper.rb +14 -0
data/spec/test-files/10-stunning-web-site-prototype-sketches.html +986 -0
data/spec/test-files/nytimes.html +1967 -0
data/spec/test-files/nytimes_story.html +1112 -0
data/spec/test-files/totlol-youtube.html +563 -0
data/spec/test-files/typhoeus-the-best-ruby-http-client-just-got-better.html +576 -0
data/spec/test-files/ustream-new-years-eve.html +840 -0
data/spec/test-files/weather-channel-marriage-proposal-touching-with-a-chance-of-viral-status-video.html +949 -0
metadata +79 -0

data/README.textile ADDED Viewed

File without changes

data/lib/extractula/dom_extractor.rb ADDED Viewed

@@ -0,0 +1,68 @@
+# a basic dom based extractor. it's a generic catch all
+class Extractula::DomExtractor
+  def extract url, html
+    @doc = Nokogiri::HTML(html)
+    extracted = Extractula::ExtractedContent.new :url => url, :title => title, :content => content
+  end
+  def title
+    @title ||= @doc.search("//title").first.text.strip rescue nil
+  end
+  def content
+    candidate_nodes = @doc.search("//div|//p|//br").collect do |node|
+      parent = node.parent
+      if node.node_name == 'div'
+        text_size = calculate_children_text_size(parent, "div")
+        if text_size > 0
+          {:text_size => text_size, :parent => parent}
+        else
+          nil
+        end
+      elsif node.node_name == "p"
+        text_size = calculate_children_text_size(parent, "p")
+        if text_size > 0
+          {:text_size => text_size, :parent => parent}
+        else
+          nil
+        end
+      elsif node.node_name == "br"
+        if node.previous.node_name == "text" && node.next.node_name == "text"
+          text_size = 0
+          parent.children.each do |child|
+            text_size += child.text.strip.size if child.node_name == "text"
+          end
+          if text_size > 0
+            {:text_size => text_size, :parent => parent}
+          else
+            nil
+          end
+        else
+          nil
+        end
+      else
+        nil
+      end
+    end.compact.uniq
+    fragment = candidate_nodes.detect {|n| n[:text_size] > 140}[:parent].inner_html.strip rescue ""
+#    Loofah.fragment(fragment).scrub!(:prune).to_s
+  end
+  def summary
+  end
+  def calculate_children_text_size(parent, node_type)
+    text_size = 0
+    parent.children.each do |child|
+      if child.node_name == node_type
+        child.children.each {|c| text_size += c.text.strip.size if c.node_name == "text"}
+      end
+    end
+    text_size
+  end
+end

data/lib/extractula/extracted_content.rb ADDED Viewed

@@ -0,0 +1,32 @@
+class Extractula::ExtractedContent
+  attr_reader :url, :title, :content
+  def initialize(attributes = {})
+    attributes.each_pair {|k, v| instance_variable_set("@#{k}", v)}
+  end
+  def summary
+    return @summary if @summary
+    @content_doc ||= Nokogiri::HTML(@content)
+    content_fragment = @content_doc.inner_text.slice(0, 350)
+    sentence_break = content_fragment.rindex(/\?|\.|\!|\;/)
+    if sentence_break
+      @summary = content_fragment.slice(0, sentence_break + 1)
+      @summary
+    else
+      @summary = content_fragment
+    end
+  end
+  def image_urls
+    return @image_urls if @image_urls
+    @content_doc ||= Nokogiri::HTML(@content)
+    @image_urls = @content_doc.search("//img").collect {|t| t["src"]}
+  end
+  def video_embed
+    return @video_embed if @video_embed
+    @content_doc ||= Nokogiri::HTML(@content)
+    @content_doc.search("//object").collect {|t| t.to_html}.first
+  end
+end

data/lib/extractula.rb ADDED Viewed

@@ -0,0 +1,24 @@
+$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
+module Extractula; end
+require 'nokogiri'
+require 'extractula/extracted_content'
+require 'extractula/dom_extractor'
+module Extractula
+  @extractors = []
+  def self.add_extractor(extractor_class)
+    @extractors << extractor_class
+  end
+  def self.remove_extractor(extractor_class)
+    @extractors.delete extractor_class
+  end
+  def self.extract(url, html)
+    extractor = @extractors.detect {|e| e.can_extract? url, html} || DomExtractor
+    extractor.new.extract(url, html)
+  end
+end

data/spec/extractula/dom_extractor_spec.rb ADDED Viewed

@@ -0,0 +1,109 @@
+require File.dirname(__FILE__) + '/../spec_helper'
+describe "dom extractor" do
+  it "returns an extracted content object with the url set" do
+    result = Extractula::DomExtractor.new.extract("http://pauldix.net", "")
+    result.should be_a Extractula::ExtractedContent
+    result.url.should == "http://pauldix.net"
+  end
+end
+describe "extraction cases" do # runs DomExtractor against saved pages read through read_test_file
+  describe "extracting from a typepad blog" do
+    before(:all) do # extract once; the examples below only read the result
+      @extracted_content = Extractula::DomExtractor.new.extract(
+        "http://www.pauldix.net/2009/10/typhoeus-the-best-ruby-http-client-just-got-better.html",
+        read_test_file("typhoeus-the-best-ruby-http-client-just-got-better.html"))
+    end
+    it "extracts the title" do
+      @extracted_content.title.should == "Paul Dix Explains Nothing: Typhoeus, the best Ruby HTTP client just got better"
+    end
+    it "extracts the content" do # the expected HTML is spelled out literally
+      @extracted_content.content.should == "<p>I've been quietly working on Typhoeus for the last few months. With the help of <a href=\"http://metaclass.org/\">Wilson Bilkovich</a> and <a href=\"http://github.com/dbalatero\">David Balatero</a> I've finished what I think is a significant improvement to the library. The new interface removes all the magic and opts instead for clarity.</p>\n<p>It's really slick and includes improved stubing support, caching, memoization, and (of course) parallelism. The <a href=\"http://github.com/pauldix/typhoeus/\">Typhoeus readme</a> highlights all of the awesomeness. It should be noted that the old interface of including Typhoeus into classes and defining remote methods has been deprecated. I'll be removing that sometime in the next six months.</p>\n<p>In addition to thanking everyone using the library and everyone contributing, I should also thank my employer kgbweb. If you're a solid Rubyist that likes parsing, crawling, and stuff, or a machine learning guy, or a Solr/Lucene/indexing bad ass, let me know. We need you and we're doing some crazy awesome stuff.</p>"
+    end
+  end
+  describe "extracting from wordpress - techcrunch" do
+    before(:all) do # extract once; the examples below only read the result
+      @extracted_content = Extractula::DomExtractor.new.extract(
+        "http://www.techcrunch.com/2009/12/29/totlol-youtube/",
+        read_test_file("totlol-youtube.html"))
+    end
+    it "extracts the title" do
+      @extracted_content.title.should == "The Sad Tale Of Totlol And How YouTube’s Changing TOS Made It Hard To Make A Buck"
+    end
+    it "extracts the content" do # expected value: inner HTML of the fixture's first div.entry
+      @extracted_content.content.should == Nokogiri::HTML(read_test_file("totlol-youtube.html")).css("div.entry").first.inner_html.strip
+    end
+  end
+  describe "extracting from wordpress - mashable" do
+    before(:all) do # extract once; the examples below only read the result
+      @extracted_content = Extractula::DomExtractor.new.extract(
+        "http://mashable.com/2009/12/29/ustream-new-years-eve/",
+        read_test_file("ustream-new-years-eve.html"))
+    end
+    it "extracts the title" do
+      @extracted_content.title.should == "New Years Eve: Watch Live Celebrations on Ustream"
+    end
+    it "extracts the content" do # expected value: inner HTML of the fixture's first div.text-content
+      @extracted_content.content.should == Nokogiri::HTML(read_test_file("ustream-new-years-eve.html")).css("div.text-content").first.inner_html.strip
+    end
+    it "extracts content with a video embed" do # uses its own fixture; the expected HTML contains the <object> embed
+      extracted = Extractula::DomExtractor.new.extract(
+        "http://mashable.com/2009/12/30/weather-channel-marriage-proposal-touching-with-a-chance-of-viral-status-video/",
+        read_test_file("weather-channel-marriage-proposal-touching-with-a-chance-of-viral-status-video.html"))
+      extracted.content.should == "<div style=\"float: left; margin-right: 10px; margin-bottom: 4px;\">\n<div class=\"wdt_button\"><iframe scrolling=\"no\" height=\"61\" frameborder=\"0\" width=\"50\" src=\"http://api.tweetmeme.com/widget.js?url=http://mashable.com/2009/12/30/weather-channel-marriage-proposal-touching-with-a-chance-of-viral-status-video/&amp;style=normal&amp;source=mashable&amp;service=bit.ly\"></iframe></div>\n<div class=\"wdt_button\" style=\"height:59px;\">\n<a name=\"fb_share\" type=\"box_count\" share_url=\"http://mashable.com/2009/12/30/weather-channel-marriage-proposal-touching-with-a-chance-of-viral-status-video/\"></a>\n</div>\n</div>\n<p><a href=\"http://mashable.com/wp-content/uploads/2009/12/weather.jpg\"><img src=\"http://mashable.com/wp-content/uploads/2009/12/weather.jpg\" alt=\"\" title=\"weather\" width=\"266\" height=\"184\" class=\"alignright size-full wp-image-174336\"></a>First <a href=\"http://mashable.com/tag/twitter/\">Twitter</a>, then Foursquare, now the Weather Channel? People are broadcasting their wedding proposals all over the place these days. </p>\n<p>That’s right, the other night Weather Channel meteorologist Kim Perez’s beau, police Sgt. Marty Cunningham (best name EVER), asked her to marry him during a routine forecast. Good thing she said yes, otherwise Cunningham’s disposition would have been cloudy with a serious chance of all-out mortification.<br><span id=\"more-174310\"></span></p>\n<p>Social media and viral videos have taken the place of the jumbotron when it comes to marriage proposals, allowing one to sound one’s not-so barbaric yawp over the roofs of the world. In today’s look-at-me society, public proposals are probably the least offensive byproduct. 
Meaning that even the most hardened of cynics can admit that they’re kind of sweet.</p>\n<p>Check out Cunningham’s proposal below (I personally enjoy that the weather map reads “<em>ring</em>ing in the New Year”), and then dive right into our list of even more social media wooers. What’s next? Entire domains dedicated to popping the question?</p>\n<p></p>\n<center>\n<object width=\"425\" height=\"344\"><param name=\"movie\" value=\"http://www.youtube.com/v/0dHTIGas4CA&amp;color1=0x3a3a3a&amp;color2=0x999999&amp;hl=en_US&amp;feature=player_embedded&amp;fs=1\">\n<param name=\"allowFullScreen\" value=\"true\">\n<param name=\"allowScriptAccess\" value=\"always\">\n<embed wmode=\"opaque\" src=\"http://www.youtube.com/v/0dHTIGas4CA&amp;color1=0x3a3a3a&amp;color2=0x999999&amp;hl=en_US&amp;feature=player_embedded&amp;fs=1\" type=\"application/x-shockwave-flash\" allowfullscreen=\"true\" allowscriptaccess=\"always\" width=\"425\" height=\"344\"></embed></object>\n<p></p>\n</center>\n<hr>\n<h2>More Wedding Bells and Whistles</h2>\n<hr>\n<p><a href=\"http://mashable.com/2009/08/28/mashable-marriage-proposal/\">CONGRATS: Mashable Marriage Proposal Live at #SocialGood [Video]</a></p>\n<p><a href=\"http://mashable.com/2009/12/19/foursquare-proposal/\">Man Proposes Marriage via Foursquare Check-In</a></p>\n<p><a href=\"http://mashable.com/2008/03/21/max-emily-twitter-proposal/\">Did We Just Witness a Twitter Marriage Proposal?</a></p>\n<p><a href=\"http://mashable.com/2009/06/30/twitter-marriage/\">Successful Marriage Proposal on Twitter Today: We #blamedrewscancer</a></p>\n<p><a href=\"http://mashable.com/2009/12/01/groom-facebook-update/\">Just Married: Groom Changes Facebook Relationship Status at the Altar [VIDEO]</a></p>"
+    end
+  end
+  describe "extracting from alleyinsider" do
+    before(:all) do # extract once; the examples below only read the result
+      @extracted_content = Extractula::DomExtractor.new.extract(
+        "http://www.businessinsider.com/10-stunning-web-site-prototype-sketches-2009-12",
+        read_test_file("10-stunning-web-site-prototype-sketches.html"))
+    end
+    it "extracts the title" do
+      @extracted_content.title.should == "10 Stunning Web Site Prototype Sketches"
+    end
+    it "extracts the content" do # expected value: inner HTML of the fixture's first div.KonaBody
+      @extracted_content.content.should == Nokogiri::HTML(read_test_file("10-stunning-web-site-prototype-sketches.html")).css("div.KonaBody").first.inner_html.strip
+    end
+  end
+  describe "extracting from nytimes" do
+    before(:all) do # the front page and a story page are extracted separately
+      @front_page = Extractula::DomExtractor.new.extract(
+        "http://www.nytimes.com/",
+        read_test_file("nytimes.html"))
+      @story_page = Extractula::DomExtractor.new.extract(
+        "http://www.nytimes.com/2009/12/31/world/asia/31history.html?_r=1&hp",
+        read_test_file("nytimes_story.html"))
+    end
+    it "extracts the title" do
+      @front_page.title.should == "The New York Times - Breaking News, World News & Multimedia"
+    end
+    it "extracts the content" do # expected value: inner HTML of the front page's first div.story
+      @front_page.content.should == Nokogiri::HTML(read_test_file("nytimes.html")).css("div.story").first.inner_html.strip
+    end
+    it "extracts a story title" do
+      @story_page.title.should == "Army Historians Document Early Missteps in Afghanistan - NYTimes.com"
+    end
+    it "extracts the story content" do # expected value: inner HTML of the story's first nyt_text element
+      @story_page.content.should == Nokogiri::HTML(read_test_file("nytimes_story.html")).css("nyt_text").first.inner_html.strip
+    end
+  end
+end

data/spec/extractula/extracted_content_spec.rb ADDED Viewed

@@ -0,0 +1,48 @@
+require File.dirname(__FILE__) + '/../spec_helper'
+describe "extracted content" do # covers Extractula::ExtractedContent
+  it "has a url" do # attributes passed to the constructor are exposed through readers
+    Extractula::ExtractedContent.new(:url => "http://pauldix.net").url.should == "http://pauldix.net"
+  end
+  it "has a title" do
+    Extractula::ExtractedContent.new(:title => "whatevs").title.should == "whatevs"
+  end
+  it "has content" do
+    Extractula::ExtractedContent.new(:content => "some content").content.should == "some content"
+  end
+  describe "summary" do # a preset :summary is returned as is; otherwise it is derived from :content
+    it "has a summary" do
+      Extractula::ExtractedContent.new(:summary => "a summary!").summary.should == "a summary!"
+    end
+    it "generates the summary from the content" do # expected text: the first three sentences with the markup removed
+      extracted = Extractula::ExtractedContent.new(:content => "<p>I've been quietly working on Typhoeus for the last few months. With the help of <a href=\"http://metaclass.org/\">Wilson Bilkovich</a> and <a href=\"http://github.com/dbalatero\">David Balatero</a> I've finished what I think is a significant improvement to the library. The new interface removes all the magic and opts instead for clarity.</p>\n<p>It's really slick and includes improved stubing support, caching, memoization, and (of course) parallelism. The <a href=\"http://github.com/pauldix/typhoeus/\">Typhoeus readme</a> highlights all of the awesomeness. It should be noted that the old interface of including Typhoeus into classes and defining remote methods has been deprecated. I'll be removing that sometime in the next six months.</p>\n<p>In addition to thanking everyone using the library and everyone contributing, I should also thank my employer kgbweb. If you're a solid Rubyist that likes parsing, crawling, and stuff, or a machine learning guy, or a Solr/Lucene/indexing bad ass, let me know. We need you and we're doing some crazy awesome stuff.</p>")
+      extracted.summary.should == "I've been quietly working on Typhoeus for the last few months. With the help of Wilson Bilkovich and David Balatero I've finished what I think is a significant improvement to the library. The new interface removes all the magic and opts instead for clarity."
+    end
+  end
+  describe "image_urls" do # a preset :image_urls is returned as is; otherwise it is derived from :content
+    it "has image_urls" do
+      Extractula::ExtractedContent.new(:image_urls => ["first.jpg", "second.tiff"]).image_urls.should == ["first.jpg", "second.tiff"]
+    end
+    it "generates the image urls from the content" do # the &amp; in the src attribute is expected back decoded as &
+      extracted = Extractula::ExtractedContent.new(:content => "<p><a href=\"http://www.businessinsider.com/10-stunning-web-site-prototype-sketches-2009-12/early-ember-1\"><img class=\"float_right\" src=\"http://static.businessinsider.com/~~/f?id=4b3a466f000000000086e662&amp;maxX=311&amp;maxY=233\" border=\"0\" height=\"233\" alt=\"Web site wireframes\" width=\"311\"></a></p>\n<div style=\"float: left; padding: 15px 15px 15px 0;\">\n\n</div>\n<p>When designers start a new Web site, they often sketch out a first idea of the page layout using paper and stencil.&nbsp;</p>\n<p>Designers call this sketch a \"wireframe.\"</p>\n<p>Woorkup.com's Antonio Lupetti <a href=\"http://woorkup.com/2009/12/28/10-beautiful-sketches-for-website-prototypes/\">collected</a> 10 beautiful examples of wireframes.</p>\n<p><a href=\"http://www.businessinsider.com/10-stunning-web-site-prototype-sketches-2009-12/early-ember-1\"><strong>He gave us permission to republish them here &gt;</strong></a></p>")
+      extracted.image_urls.should == ["http://static.businessinsider.com/~~/f?id=4b3a466f000000000086e662&maxX=311&maxY=233"]
+    end
+  end
+  describe "video_embed" do # a preset :video_embed is returned as is; otherwise it is derived from :content
+    it "has a video_embed" do
+      Extractula::ExtractedContent.new(:video_embed => "some embed code").video_embed.should == "some embed code"
+    end
+    it "pulls video embed tags from the content" do # expects the HTML of the <object> element found in the content
+      extracted = Extractula::ExtractedContent.new(:content => "<div style=\"float: left; margin-right: 10px; margin-bottom: 4px;\">\n<div class=\"wdt_button\"><iframe scrolling=\"no\" height=\"61\" frameborder=\"0\" width=\"50\" src=\"http://api.tweetmeme.com/widget.js?url=http://mashable.com/2009/12/30/weather-channel-marriage-proposal-touching-with-a-chance-of-viral-status-video/&amp;style=normal&amp;source=mashable&amp;service=bit.ly\"></iframe></div>\n<div class=\"wdt_button\" style=\"height:59px;\">\n<a name=\"fb_share\" type=\"box_count\" share_url=\"http://mashable.com/2009/12/30/weather-channel-marriage-proposal-touching-with-a-chance-of-viral-status-video/\"></a>\n</div>\n</div>\n<p><a href=\"http://mashable.com/wp-content/uploads/2009/12/weather.jpg\"><img src=\"http://mashable.com/wp-content/uploads/2009/12/weather.jpg\" alt=\"\" title=\"weather\" width=\"266\" height=\"184\" class=\"alignright size-full wp-image-174336\"></a>First <a href=\"http://mashable.com/tag/twitter/\">Twitter</a>, then Foursquare, now the Weather Channel? People are broadcasting their wedding proposals all over the place these days. </p>\n<p>That’s right, the other night Weather Channel meteorologist Kim Perez’s beau, police Sgt. Marty Cunningham (best name EVER), asked her to marry him during a routine forecast. Good thing she said yes, otherwise Cunningham’s disposition would have been cloudy with a serious chance of all-out mortification.<br><span id=\"more-174310\"></span></p>\n<p>Social media and viral videos have taken the place of the jumbotron when it comes to marriage proposals, allowing one to sound one’s not-so barbaric yawp over the roofs of the world. In today’s look-at-me society, public proposals are probably the least offensive byproduct. 
Meaning that even the most hardened of cynics can admit that they’re kind of sweet.</p>\n<p>Check out Cunningham’s proposal below (I personally enjoy that the weather map reads “<em>ring</em>ing in the New Year”), and then dive right into our list of even more social media wooers. What’s next? Entire domains dedicated to popping the question?</p>\n<p></p>\n<center>\n<object width=\"425\" height=\"344\"><param name=\"movie\" value=\"http://www.youtube.com/v/0dHTIGas4CA&amp;color1=0x3a3a3a&amp;color2=0x999999&amp;hl=en_US&amp;feature=player_embedded&amp;fs=1\">\n<param name=\"allowFullScreen\" value=\"true\">\n<param name=\"allowScriptAccess\" value=\"always\">\n<embed wmode=\"opaque\" src=\"http://www.youtube.com/v/0dHTIGas4CA&amp;color1=0x3a3a3a&amp;color2=0x999999&amp;hl=en_US&amp;feature=player_embedded&amp;fs=1\" type=\"application/x-shockwave-flash\" allowfullscreen=\"true\" allowscriptaccess=\"always\" width=\"425\" height=\"344\"></embed></object>\n<p></p>\n</center>\n<hr>\n<h2>More Wedding Bells and Whistles</h2>\n<hr>\n<p><a href=\"http://mashable.com/2009/08/28/mashable-marriage-proposal/\">CONGRATS: Mashable Marriage Proposal Live at #SocialGood [Video]</a></p>\n<p><a href=\"http://mashable.com/2009/12/19/foursquare-proposal/\">Man Proposes Marriage via Foursquare Check-In</a></p>\n<p><a href=\"http://mashable.com/2008/03/21/max-emily-twitter-proposal/\">Did We Just Witness a Twitter Marriage Proposal?</a></p>\n<p><a href=\"http://mashable.com/2009/06/30/twitter-marriage/\">Successful Marriage Proposal on Twitter Today: We #blamedrewscancer</a></p>\n<p><a href=\"http://mashable.com/2009/12/01/groom-facebook-update/\">Just Married: Groom Changes Facebook Relationship Status at the Altar [VIDEO]</a></p>")
+      extracted.video_embed.should == "<object width=\"425\" height=\"344\"><param name=\"movie\" value=\"http://www.youtube.com/v/0dHTIGas4CA&amp;color1=0x3a3a3a&amp;color2=0x999999&amp;hl=en_US&amp;feature=player_embedded&amp;fs=1\">\n<param name=\"allowFullScreen\" value=\"true\">\n<param name=\"allowScriptAccess\" value=\"always\">\n<embed wmode=\"opaque\" src=\"http://www.youtube.com/v/0dHTIGas4CA&amp;color1=0x3a3a3a&amp;color2=0x999999&amp;hl=en_US&amp;feature=player_embedded&amp;fs=1\" type=\"application/x-shockwave-flash\" allowfullscreen=\"true\" allowscriptaccess=\"always\" width=\"425\" height=\"344\"></embed></object>"
+    end
+  end
+end

data/spec/extractula_spec.rb ADDED Viewed

@@ -0,0 +1,45 @@
+require File.dirname(__FILE__) + '/spec_helper'
+describe "extractula" do
+  it "can add custom extractors" do
+    custom_extractor = Class.new do
+      def self.can_extract? url, html
+        true
+      end
+      def extract url, html
+        Extractula::ExtractedContent.new :url => "custom extractor url", :summary => "my custom extractor"
+      end
+    end
+    Extractula.add_extractor custom_extractor
+    content = Extractula.extract("http://pauldix.net", "some html")
+    content.url.should == "custom extractor url"
+    content.summary.should == "my custom extractor"
+    Extractula.remove_extractor custom_extractor
+  end
+  it "skips custom extractors that can't extract the passed url and html" do
+    custom_extractor = Class.new do
+      def self.can_extract? url, html
+        false
+      end
+      def extract url, html
+        Extractula::ExtractedContent.new :url => "this url", :summary => "this summary"
+      end
+    end
+    Extractula.add_extractor custom_extractor
+    content = Extractula.extract("http://pauldix.net", "some html")
+    content.url.should_not == "this url"
+    content.summary.should_not == "this summary"
+    Extractula.remove_extractor custom_extractor
+  end
+  it "extracts from a url and document and returns an ExtractedContent object" do
+    result = Extractula.extract("http://pauldix.net", "")
+    result.should be_a Extractula::ExtractedContent
+    result.url.should == "http://pauldix.net"
+  end
+end

data/spec/spec.opts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ --diff
2	+ --color

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,14 @@
+require "rubygems"
+require "spec"
+# gem install redgreen for colored test output
+begin require "redgreen" unless ENV['TM_CURRENT_LINE']; rescue LoadError; end
+path = File.expand_path(File.dirname(__FILE__) + "/../lib/")
+$LOAD_PATH.unshift(path) unless $LOAD_PATH.include?(path)
+require "lib/extractula"
+def read_test_file(file_name)
+  File.read("#{File.dirname(__FILE__)}/test-files/#{file_name}")
+end