content_scrapper 0.0.7 → 0.0.8
- data/VERSION +1 -1
- data/content_scrapper.gemspec +2 -2
- data/lib/content_scrapper.rb +2 -2
- data/lib/content_scrapper/feedzirra.rb +5 -4
- data/test/test_content_scrapper.rb +31 -0
- metadata +2 -2
data/VERSION
CHANGED
@@ -1 +1 @@
-0.0.7
+0.0.8
data/content_scrapper.gemspec
CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{content_scrapper}
-  s.version = "0.0.7"
+  s.version = "0.0.8"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Gyorgy Frivolt"]
-  s.date = %q{2010-
+  s.date = %q{2010-03-09}
   s.description = %q{If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.}
   s.email = %q{gyorgy.frivolt@gmail.com}
   s.extra_rdoc_files = [
data/lib/content_scrapper.rb
CHANGED
@@ -51,12 +51,12 @@ class ContentScrapper
     end
   end
 
-  def scrap_content(url)
+  def scrap_content(url, full_page = nil)
     content_mappings.each do | content_mapping |
       if content_mapping.matches_url?(url)
         return nil if content_mapping.content_xpaths_list.empty?
         begin
-          doc = Nokogiri::HTML(Kernel.open(url))
+          doc = Nokogiri::HTML(full_page || Kernel.open(url))
           return content_mapping.scrap_content(doc, content_scrapper = self)
         rescue Exception
           @scrapping_exception_handler_block.call($!) unless @scrapping_exception_handler_block.nil?
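
The effect of this change is that ContentScrapper#scrap_content can now be handed a page body that was fetched earlier; when the second argument is non-nil, Nokogiri parses it directly and Kernel.open(url) is never called. A minimal usage sketch, assuming a default scrapper has already been configured; the URL and cached file path are placeholders, not part of the gem:

# Sketch only: URL and cached file are illustrative.
require 'content_scrapper'

cached_html = File.read('cache/article.html')   # page downloaded earlier by other means
scrapper    = ContentScrapper.default           # previously configured default scrapper

# Passing the body as the second argument skips the HTTP request for the URL.
content = scrapper.scrap_content('http://example.com/article', cached_html)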
data/lib/content_scrapper/feedzirra.rb
CHANGED
@@ -1,17 +1,18 @@
 # feedzirra entries are extended by methods for scrapping content
 require 'feedzirra/feed_entry_utilities'
+require 'ruby-debug'
 
 module Feedzirra
   module FeedEntryUtilities
 
     # Scrap the content based on the URL and the existing content and return it
-    def scrap_content(scrapper = ContentScrapper.default)
-      scrapper.scrap_content(self.url) || self.content.to_s
+    def scrap_content(scrapper = ContentScrapper.default, full_page = nil)
+      scrapper.scrap_content(self.url, full_page = full_page) || self.content.to_s
     end
 
     # Scrap the content or use the existing one and change the feed entry
-    def scrap_content!(scrapper = ContentScrapper.default)
-      content = scrap_content(scrapper)
+    def scrap_content!(scrapper = ContentScrapper.default, full_page = nil)
+      self.content = scrap_content(scrapper, full_page = full_page)
     end
   end
 end
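
On the feedzirra side the same optional argument is threaded through the entry helpers, so a feed processor that has already downloaded the article can reuse that body instead of re-fetching it. A hedged sketch mirroring the new test cases; the entry construction, URL, and file path are illustrative:

require 'content_scrapper/feedzirra'

entry = Feedzirra::Parser::RSSEntry.new
entry.url = 'http://example.com/article'
body = File.read('cache/article.html')   # already-downloaded page

# Returns the scrapped content, falling back to entry.content on failure.
scrapped = entry.scrap_content(ContentScrapper.default, body)

# Replaces entry.content in place with the scrapped content.
entry.scrap_content!(ContentScrapper.default, body)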
data/test/test_content_scrapper.rb
CHANGED
@@ -107,6 +107,18 @@ class TestContentScrapper < Test::Unit::TestCase
     should("not match enything") { assert_nil @entry_content }
   end
 
+  context "on already downloaded document" do
+    setup do
+      pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
+      Kernel.expects(:open).never
+      @scrapped_content = @scrapper.scrap_content('http://www.pretty.url/hsdae',
+        full_page = pretty_content)
+    end
+    should "scrap from the provided full page" do
+      assert_match(%r{<p><strong>This is a strong text</strong></p>}, @scrapped_content)
+    end
+  end
+
   context "on scrapping with feedzirra" do
     setup do
       require 'content_scrapper/feedzirra'
@@ -132,6 +144,25 @@ class TestContentScrapper < Test::Unit::TestCase
         end
       end
     end
+
+    context "on feed entry with url and scrapping with full_page" do
+      setup do
+        @feed_entries = [ Feedzirra::Parser::RSSEntry.new, Feedzirra::Parser::AtomEntry.new ]
+        @feed_entries.each do |feed_entry|
+          feed_entry.url = 'http://www.pretty.url/wedhsf'
+        end
+        @pretty_content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
+        Kernel.expects(:open).never
+      end
+      should("return the original feed content") do
+        @feed_entries.each do |feed_entry|
+          assert_match(%r{<p><strong>This is a strong text</strong></p>},
+            feed_entry.scrap_content(@scrapper, full_page = @pretty_content))
+          feed_entry.scrap_content!(@scrapper, full_page = @pretty_content)
+          assert_match(%r{<p><strong>This is a strong text</strong></p>}, feed_entry.content)
+        end
+      end
+    end
   end
 
   context "on failing scrapping" do
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: content_scrapper
 version: !ruby/object:Gem::Version
-  version: 0.0.7
+  version: 0.0.8
 platform: ruby
 authors:
 - Gyorgy Frivolt
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-
+date: 2010-03-09 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency