RubyGems - content_scrapper - Versions diffs - 0.0.6 → 0.0.7 - Mend

content_scrapper 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

data/Rakefile +0 -1
data/VERSION +1 -1
data/content_scrapper.gemspec +4 -5
data/lib/content_scrapper.rb +17 -4
data/lib/content_scrapper/content_mapping.rb +2 -1
data/test/test_content_mapping.rb +2 -2
data/test/test_content_scrapper.rb +18 -0
data/test/test_pages.rb +29 -0
data/test/test_pages/cdata.html +23 -0
metadata +4 -12
data/.specification +0 -115

data/Rakefile CHANGED Viewed

@@ -13,7 +13,6 @@ begin
     gem.add_development_dependency 'thoughtbot-shoulda', '>=2.10.2'
     gem.add_development_dependency 'mocha', '>=0.9.8'
-    gem.add_dependency 'sanitize', '>=1.2.0'
     gem.add_dependency 'nokogiri', '>=1.4.1'
   end
   Jeweler::GemcutterTasks.new

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.0.6
1	+ 0.0.7

data/content_scrapper.gemspec CHANGED Viewed

@@ -5,7 +5,7 @@
 Gem::Specification.new do |s|
   s.name = %q{content_scrapper}
-  s.version = "0.0.6"
+  s.version = "0.0.7"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Gyorgy Frivolt"]
@@ -19,7 +19,6 @@ Gem::Specification.new do |s|
   s.files = [
     ".document",
      ".gitignore",
-     ".specification",
      "LICENSE",
      "README.rdoc",
      "Rakefile",
@@ -33,6 +32,8 @@ Gem::Specification.new do |s|
      "test/helper.rb",
      "test/test_content_mapping.rb",
      "test/test_content_scrapper.rb",
+     "test/test_pages.rb",
+     "test/test_pages/cdata.html",
      "test/test_pages/encoding.html",
      "test/test_pages/pretty.html",
      "test/test_pages/twocontent.html",
@@ -46,6 +47,7 @@ Gem::Specification.new do |s|
   s.test_files = [
     "test/test_content_mapping.rb",
      "test/test_content_scrapper.rb",
+     "test/test_pages.rb",
      "test/helper.rb"
   ]
@@ -56,18 +58,15 @@ Gem::Specification.new do |s|
     if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
       s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
       s.add_development_dependency(%q<mocha>, [">= 0.9.8"])
-      s.add_runtime_dependency(%q<sanitize>, [">= 1.2.0"])
       s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.1"])
     else
       s.add_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
       s.add_dependency(%q<mocha>, [">= 0.9.8"])
-      s.add_dependency(%q<sanitize>, [">= 1.2.0"])
       s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
     end
   else
     s.add_dependency(%q<thoughtbot-shoulda>, [">= 2.10.2"])
     s.add_dependency(%q<mocha>, [">= 0.9.8"])
-    s.add_dependency(%q<sanitize>, [">= 1.2.0"])
     s.add_dependency(%q<nokogiri>, [">= 1.4.1"])
   end
 end

data/lib/content_scrapper.rb CHANGED Viewed

@@ -1,7 +1,6 @@
 require 'open-uri'
 require 'nokogiri'
-require 'sanitize'
 require 'content_scrapper/content_mapping'
@@ -20,7 +19,7 @@ class ContentScrapper
     ContentScrapper.default = self
   end
-  attr_accessor :content_mappings, :sanitize_settings
+  attr_reader :content_mappings
   def initialize(scrapper_config_file = nil)
     @content_mappings = []
@@ -34,8 +33,22 @@ class ContentScrapper
     @content_mappings << new_mapping
   end
-  def sanitize_tags(&block)
-    @sanitize_settings = block.call()
+  def clean_content(content)
+    @content_cleaner_block.nil? ? content : @content_cleaner_block.call(content)
+  end
+  def sanitize_tags(&sanitize_settings)
+    @content_cleaner_block = lambda do |content|
+      require 'sanitize'
+      Sanitize.clean(content, sanitize_settings.call())
+    end
+  end
+  def loofah_tags(scrap_type)
+    @content_scrapper_block = lambda do |content|
+      require 'loofah'
+      Loofah.document(content).scrub!(scrap_type).to_s
+    end
   end
   def scrap_content(url)

data/lib/content_scrapper/content_mapping.rb CHANGED Viewed

@@ -29,10 +29,11 @@ class ContentMapping
     @content_xpaths_list.each do |content_xpath|
       content_section = doc.xpath(content_xpath)
       content = content_section.to_a.join("\n")
-      content = Sanitize.clean(content, content_scrapper.sanitize_settings) unless content_scrapper.nil?
+      content = content_scrapper.clean_content(content) unless content_scrapper.nil?
       content = Iconv.conv(to=iconv_to, from=iconv_from, content) unless iconv_to.nil?
       return content if content_section.count > 0
     end
     nil
   end
 end

data/test/test_content_mapping.rb CHANGED Viewed

@@ -9,7 +9,7 @@ class TestContentMapping < Test::Unit::TestCase
       @mapping = ContentMapping.new
       @mapping.instance_eval do
         url_pattern /^http:\/\/www\.matchme\.com\//
-        content_at '//div[@id="failing_content"]'
+          content_at '//div[@id="failing_content"]'
         content_at '//div[@id="itext_content"]'
         content_at '//div[@id="itext_second_content"]'
       end
@@ -46,7 +46,7 @@ class TestContentMapping < Test::Unit::TestCase
       @mapping = ContentMapping.new
       @mapping.instance_eval do
         url_pattern /^http:\/\/www\.matchme\.com\//
-        content_at '//div[@class="node node-story"]/div[@class="content"]/p'
+          content_at '//div[@class="node node-story"]/div[@class="content"]/p'
         iconv :to => 'utf8', :from => 'latin1'
       end
       page = File.open("#{File.dirname(__FILE__)}/test_pages/encoding.html").read

data/test/test_content_scrapper.rb CHANGED Viewed

@@ -5,6 +5,24 @@ class TestContentScrapper < Test::Unit::TestCase
   ContentScrapper.default_config_file = nil
+  context "on settings without sanitization tags" do
+    setup do
+      @scrapper = ContentScrapper.new
+      @scrapper.instance_eval do
+        content_mapping do
+          url_pattern /.*/
+          content_at '//div[@id="itext_content"]'
+        end
+      end
+      content = File.open("#{File.dirname(__FILE__)}/test_pages/pretty.html").read
+      stringio = StringIO.new(content)
+      Kernel.expects(:open).returns(stringio)
+    end
+    should 'not sanitize' do
+      assert !@scrapper.scrap_content('http://www.pretty.url/fsdsd').nil?
+    end
+  end
   context "on common settings" do
     setup do
       @scrapper = ContentScrapper.new

data/test/test_pages.rb ADDED Viewed

@@ -0,0 +1,29 @@
+require 'helper'
+require 'mocha'
+class TestContentScrapper < Test::Unit::TestCase
+  context "on page containing CDATA" do
+    setup do
+      @scrapper = ContentScrapper.new
+      @scrapper.instance_eval do
+        content_mapping do
+          url_pattern /.*/
+          content_at '//div[@class="art-full adwords-text"]'
+        end
+        loofah_tags(:strip)
+      end
+      @scrapper.rescue_scrapping do |exception|
+        puts exception
+      end
+      cdata_content = File.open("#{File.dirname(__FILE__)}/test_pages/cdata.html").read
+      Kernel.expects(:open).returns(StringIO.new(cdata_content))
+    end
+    should "not escape the cdata entries, should leave cdata unvisible" do
+      #<!--<![CDATA[
+      assert_match /<!--</, @scrapper.scrap_content('http://www.cdata.url/hsdae')
+    end
+  end
+end

data/test/test_pages/cdata.html ADDED Viewed

@@ -0,0 +1,23 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<body>
+  <div class="art-full adwords-text">
+    <h1>Judínyová: Celebrity sveta módy zažila na vlastnej koži</h1>
+    <div id="zoom" class="zoom2">
+      <div class="art-info">Pravda.sk |
+        <span class="time-date">25.&nbsp;februára&nbsp;2010&nbsp;&nbsp;5:51</span>
+      </div>
+      <div class="perex" id="article-perex">Moderátorka Erika Judínyová sa nedávno vrátila z výletu v New Yorku. Zúčastnila sa tam na módnom týždni a s televíznym štábom aj niekoľko prehliadok. </div>
+      <ul>
+        <li class="fotogaleria">
+          <a href="/foto.asp?r=sk-kkoktail&amp;c=A100224_165338_sk-kkoktail_p20">
+            Galéria: Erika Judínyová</a>
+          <script type="text/javascript"><!--<![CDATA[
+            /* SLAVE: perex_sk.perex_sk.perex.koktail.center */
+            ado.slave('adoceanskqdisnunpvu', {myMaster: 'uikHnAPTNwh_AVZX4uAdPP6xUQPhUSb01rCKlMcgapn.97' });
+            //]]>--></script>
+          <!-- [/Koktail/CENTER] -->
+        </div><p>&quot;Videli sme napríklad šou Custo Barcelona alebo prehliadku Very Wangovej,&quot; prezradila pre Pravda.sk moderátorka Smotánky. Spolu so štábom zaznamenávala dianie. Jeden z najzaujímavejších úlovkov pre kameru markízackej relácie bola šéfredaktorka magazínu Vogue Anna Wintour. &quot;Prišla na prehliadku, nasadila si tmavé okuliare a potichu zívala. Čo ma však najviac prekvapilo, nezostala do konca prehliadky, asi tri minúty pred koncom sa povýšenecky zdvihla a odišla,&quot; zaspomínala si Judínyová.</p>
+      </li>
+    </ul>
+  </body>
+</html>

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: content_scrapper
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.0.7
 platform: ruby
 authors:
 - Gyorgy Frivolt
@@ -32,16 +32,6 @@ dependencies:
       - !ruby/object:Gem::Version
         version: 0.9.8
     version:
-- !ruby/object:Gem::Dependency
-  name: sanitize
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 1.2.0
-    version:
 - !ruby/object:Gem::Dependency
   name: nokogiri
   type: :runtime
@@ -64,7 +54,6 @@ extra_rdoc_files:
 files:
 - .document
 - .gitignore
-- .specification
 - LICENSE
 - README.rdoc
 - Rakefile
@@ -78,6 +67,8 @@ files:
 - test/helper.rb
 - test/test_content_mapping.rb
 - test/test_content_scrapper.rb
+- test/test_pages.rb
+- test/test_pages/cdata.html
 - test/test_pages/encoding.html
 - test/test_pages/pretty.html
 - test/test_pages/twocontent.html
@@ -113,4 +104,5 @@ summary: Gem for those who want to screen scrap only the content part of web pag
 test_files:
 - test/test_content_mapping.rb
 - test/test_content_scrapper.rb
+- test/test_pages.rb
 - test/helper.rb

data/.specification DELETED Viewed

@@ -1,115 +0,0 @@
---- !ruby/object:Gem::Specification
-name: content_scrapper
-version: !ruby/object:Gem::Version
-  version: 99.99.99
-platform: ruby
-authors:
-- Gyorgy Frivolt
-autorequire:
-bindir: bin
-cert_chain: []
-date: 2010-02-13 00:00:00 +01:00
-default_executable:
-dependencies:
-- !ruby/object:Gem::Dependency
-  name: thoughtbot-shoulda
-  type: :development
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 2.10.2
-    version:
-- !ruby/object:Gem::Dependency
-  name: mocha
-  type: :development
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 0.9.8
-    version:
-- !ruby/object:Gem::Dependency
-  name: sanitize
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 1.2.0
-    version:
-- !ruby/object:Gem::Dependency
-  name: nokogiri
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 1.4.1
-    version:
-description: If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization.
-email: gyorgy.frivolt@gmail.com
-executables: []
-extensions: []
-extra_rdoc_files:
-- LICENSE
-- README.rdoc
-files:
-- .document
-- .gitignore
-- LICENSE
-- README.rdoc
-- Rakefile
-- VERSION
-- config/content_scrapper.rb
-- content_scrapper.gemspec
-- lib/content_scrapper.rb
-- lib/content_scrapper/content_mapping.rb
-- lib/content_scrapper/feedzirra.rb
-- rails/init.rb
-- test/helper.rb
-- test/test_content_mapping.rb
-- test/test_content_scrapper.rb
-- test/test_pages/pretty.html
-- test/test_pages/twocontent.html
-- test/test_pages/ugly.html
-has_rdoc: true
-homepage: http://github.com/fifigyuri/content_scrapper
-licenses: []
-post_install_message:
-rdoc_options:
-- --charset=UTF-8
-require_paths:
-- bin
-- lib
-required_ruby_version: !ruby/object:Gem::Requirement
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      version: "0"
-  version:
-required_rubygems_version: !ruby/object:Gem::Requirement
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      version: "0"
-  version:
-requirements: []
-rubyforge_project:
-rubygems_version: 1.3.5
-signing_key:
-specification_version: 3
-summary: Gem for those who want to screen scrap only the content part of web pages, blogs or articles.
-test_files:
-- test/test_content_mapping.rb
-- test/test_content_scrapper.rb
-- test/helper.rb