content_scrapper 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.specification +114 -0
- data/VERSION +1 -1
- data/content_scrapper.gemspec +3 -2
- data/lib/content_scrapper/content_mapping.rb +4 -2
- data/lib/content_scrapper.rb +14 -8
- data/test/test_content_mapping.rb +2 -1
- data/test/test_content_scrapper.rb +35 -3
- metadata +3 -2
data/.specification
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: content_scrapper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.4
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Gyorgy Frivolt
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-13 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: thoughtbot-shoulda
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 2.10.2
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: mocha
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.9.8
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: sanitize
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.2.0
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: nokogiri
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.4.1
|
54
|
+
version:
|
55
|
+
description: If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.
|
56
|
+
email: gyorgy.frivolt@gmail.com
|
57
|
+
executables: []
|
58
|
+
|
59
|
+
extensions: []
|
60
|
+
|
61
|
+
extra_rdoc_files:
|
62
|
+
- LICENSE
|
63
|
+
- README.rdoc
|
64
|
+
files:
|
65
|
+
- .document
|
66
|
+
- .gitignore
|
67
|
+
- LICENSE
|
68
|
+
- README.rdoc
|
69
|
+
- Rakefile
|
70
|
+
- VERSION
|
71
|
+
- config/content_scrapper.rb
|
72
|
+
- content_scrapper.gemspec
|
73
|
+
- lib/content_scrapper.rb
|
74
|
+
- lib/content_scrapper/content_mapping.rb
|
75
|
+
- lib/content_scrapper/feedzirra.rb
|
76
|
+
- rails/init.rb
|
77
|
+
- test/helper.rb
|
78
|
+
- test/test_content_mapping.rb
|
79
|
+
- test/test_content_scrapper.rb
|
80
|
+
- test/test_pages/pretty.html
|
81
|
+
- test/test_pages/twocontent.html
|
82
|
+
- test/test_pages/ugly.html
|
83
|
+
has_rdoc: true
|
84
|
+
homepage: http://github.com/fifigyuri/content_scrapper
|
85
|
+
licenses: []
|
86
|
+
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options:
|
89
|
+
- --charset=UTF-8
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: "0"
|
97
|
+
version:
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: "0"
|
103
|
+
version:
|
104
|
+
requirements: []
|
105
|
+
|
106
|
+
rubyforge_project:
|
107
|
+
rubygems_version: 1.3.5
|
108
|
+
signing_key:
|
109
|
+
specification_version: 3
|
110
|
+
summary: Gem for those who want to screen scrap only the content part of web pages, blogs or articles.
|
111
|
+
test_files:
|
112
|
+
- test/test_content_mapping.rb
|
113
|
+
- test/test_content_scrapper.rb
|
114
|
+
- test/helper.rb
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.4
|
1
|
+
0.0.5
|
data/content_scrapper.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{content_scrapper}
|
8
|
-
s.version = "0.0.4"
|
8
|
+
s.version = "0.0.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Gyorgy Frivolt"]
|
12
|
-
s.date = %q{2010-02-13}
|
12
|
+
s.date = %q{2010-02-22}
|
13
13
|
s.description = %q{If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.}
|
14
14
|
s.email = %q{gyorgy.frivolt@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -19,6 +19,7 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
21
|
".gitignore",
|
22
|
+
".specification",
|
22
23
|
"LICENSE",
|
23
24
|
"README.rdoc",
|
24
25
|
"Rakefile",
|
@@ -19,10 +19,12 @@ class ContentMapping
|
|
19
19
|
url =~ @url_pattern_regexp
|
20
20
|
end
|
21
21
|
|
22
|
-
def scrap_content(doc)
|
22
|
+
def scrap_content(doc, content_scrapper = nil)
|
23
23
|
@content_xpaths_list.each do |content_xpath|
|
24
24
|
content_section = doc.xpath(content_xpath)
|
25
|
-
|
25
|
+
content = content_section.to_a.join("\n")
|
26
|
+
content = Sanitize.clean(content, content_scrapper.sanitize_settings) unless content_scrapper.nil?
|
27
|
+
return content if content_section.count > 0
|
26
28
|
end
|
27
29
|
nil
|
28
30
|
end
|
data/lib/content_scrapper.rb
CHANGED
@@ -24,7 +24,7 @@ class ContentScrapper
|
|
24
24
|
|
25
25
|
def initialize(scrapper_config_file = nil)
|
26
26
|
@content_mappings = []
|
27
|
-
config_file = ContentScrapper.default_config_file
|
27
|
+
config_file = scrapper_config_file || ContentScrapper.default_config_file
|
28
28
|
self.instance_eval(File.read(config_file), config_file) unless config_file.nil?
|
29
29
|
end
|
30
30
|
|
@@ -34,8 +34,8 @@ class ContentScrapper
|
|
34
34
|
@content_mappings << new_mapping
|
35
35
|
end
|
36
36
|
|
37
|
-
def sanitize_tags(
|
38
|
-
@sanitize_settings =
|
37
|
+
def sanitize_tags(&block)
|
38
|
+
@sanitize_settings = block.call()
|
39
39
|
end
|
40
40
|
|
41
41
|
def scrap_content(url)
|
@@ -44,17 +44,23 @@ class ContentScrapper
|
|
44
44
|
return nil if content_mapping.content_xpaths_list.empty?
|
45
45
|
begin
|
46
46
|
doc = Nokogiri::HTML(Kernel.open(url))
|
47
|
-
|
48
|
-
return nil if content.nil?
|
49
|
-
return Sanitize.clean(content, sanitize_settings)
|
47
|
+
return content_mapping.scrap_content(doc, content_scrapper = self)
|
50
48
|
rescue Exception
|
51
|
-
|
49
|
+
@scrapping_exception_handler_block.call($!) unless @scrapping_exception_handler_block.nil?
|
50
|
+
return nil
|
52
51
|
end
|
53
52
|
end
|
54
53
|
end
|
54
|
+
@missing_url_matcher_handler_block.call(url) unless @missing_url_matcher_handler_block.nil?
|
55
55
|
nil
|
56
56
|
end
|
57
57
|
|
58
|
-
def
|
58
|
+
def rescue_scrapping(&block)
|
59
|
+
@scrapping_exception_handler_block = block
|
60
|
+
end
|
61
|
+
|
62
|
+
def missing_url_matcher(&block)
|
63
|
+
@missing_url_matcher_handler_block = block
|
59
64
|
end
|
60
65
|
end
|
66
|
+
|
@@ -26,7 +26,8 @@ class TestContentMapping < Test::Unit::TestCase
|
|
26
26
|
@document = Nokogiri::HTML(pretty_content)
|
27
27
|
end
|
28
28
|
should "extract the content" do
|
29
|
-
assert_match(%r{<p><strong>This is a strong text</strong></p>},
|
29
|
+
assert_match(%r{<p><strong>This is a strong text</strong></p>},
|
30
|
+
@mapping.scrap_content(@document))
|
30
31
|
end
|
31
32
|
end
|
32
33
|
context "on document with two content parts" do
|
@@ -5,7 +5,7 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
5
5
|
|
6
6
|
ContentScrapper.default_config_file = nil
|
7
7
|
|
8
|
-
context "on common
|
8
|
+
context "on common settings" do
|
9
9
|
setup do
|
10
10
|
@scrapper = ContentScrapper.new
|
11
11
|
@scrapper.instance_eval do
|
@@ -35,8 +35,10 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
35
35
|
content_at '//div[@id="never_should_be_here"]'
|
36
36
|
end
|
37
37
|
|
38
|
-
sanitize_tags
|
39
|
-
|
38
|
+
sanitize_tags do
|
39
|
+
{:elements => ['p','br', 'b', 'em', 'i', 'strong', 'u', 'a', 'h1', 'h2', 'h3', 'li', 'ol', 'ul'], \
|
40
|
+
:attributes => { 'a' => ['href'] }}
|
41
|
+
end
|
40
42
|
end
|
41
43
|
end
|
42
44
|
|
@@ -113,6 +115,34 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
113
115
|
end
|
114
116
|
end
|
115
117
|
end
|
118
|
+
|
119
|
+
context "on failing scrapping" do
|
120
|
+
setup do
|
121
|
+
Kernel.expects(:open).raises(Exception, 'something failed')
|
122
|
+
@exception_handle_flag = nil
|
123
|
+
@scrapper.rescue_scrapping do |exception|
|
124
|
+
@exception_handle_flag = exception.message
|
125
|
+
end
|
126
|
+
end
|
127
|
+
should "catch the exception and handle it" do
|
128
|
+
assert_nil @scrapper.scrap_content('http://www.pretty.url')
|
129
|
+
assert_equal 'something failed', @exception_handle_flag
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
context "on missing url matcher" do
|
134
|
+
setup do
|
135
|
+
Kernel.expects(:open).never
|
136
|
+
@missing_url_matcher_flag = nil
|
137
|
+
@scrapper.missing_url_matcher do |url|
|
138
|
+
@missing_url_matcher_flag = url
|
139
|
+
end
|
140
|
+
@scrapper.scrap_content('http://missing.url.matcher')
|
141
|
+
end
|
142
|
+
should "call the handler block" do
|
143
|
+
assert_equal 'http://missing.url.matcher', @missing_url_matcher_flag
|
144
|
+
end
|
145
|
+
end
|
116
146
|
end
|
117
147
|
|
118
148
|
context "on setting default content scrapper" do
|
@@ -126,6 +156,7 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
126
156
|
assert_equal @new_scrapper, ContentScrapper.default
|
127
157
|
end
|
128
158
|
end
|
159
|
+
|
129
160
|
context "for feed entry" do
|
130
161
|
setup do
|
131
162
|
@feed_entry = Feedzirra::Parser::RSSEntry.new
|
@@ -138,3 +169,4 @@ class TestContentScrapper < Test::Unit::TestCase
|
|
138
169
|
end
|
139
170
|
end
|
140
171
|
end
|
172
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: content_scrapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gyorgy Frivolt
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-02-13 00:00:00 +01:00
|
12
|
+
date: 2010-02-22 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -64,6 +64,7 @@ extra_rdoc_files:
|
|
64
64
|
files:
|
65
65
|
- .document
|
66
66
|
- .gitignore
|
67
|
+
- .specification
|
67
68
|
- LICENSE
|
68
69
|
- README.rdoc
|
69
70
|
- Rakefile
|