RubyGems - scrapifier - Versions diffs - 0.0.4 → 0.0.5 - Mend

scrapifier 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 67a72b01b9c64d37f15c459e11c536f29bd0abc2
-  data.tar.gz: d83f2f8c7ff3203ee2a5472e2e2099a4389af7aa
+  metadata.gz: e52f7f47695c7b16fed80a51f99a119d3f98a3b7
+  data.tar.gz: 24db438acd2fb2df421ab7b6c148e7cbed3331ee
 SHA512:
-  metadata.gz: 2a4abdb171a16b46b89bfb4c43a3262169c08c70125b3ac88e9c4f13209acd5085368b3a4b9399691b9fb9dc89fc4cacfe34b781a26df3c5646bfd4f20f44435
-  data.tar.gz: c246dd2f9770e3835f4423b497b0bb86d38e51ff9030b59aa516be89f11262db159d2938f94e43f60b91fb14c2c56015965fb8556eec963a13e3c568a4c68d9d
+  metadata.gz: 41e5f58a1760d61196ee3b4f429644025789d4432e7a1cef1a70d31473a26c12a8e38091527adacc78e7650759fc55cbdb80e365ce3f86954fcf493304f08d86
+  data.tar.gz: 248f23ec549f331a942d1847b9fc4d92935dea34993128fe5bd63fca9a4a4b5814ee531ef3c390e38502cd7f802c55a12dfeef6aa7eb434782d6f735a4d29e28

data/.travis.yml CHANGED

@@ -1,4 +1,5 @@
 language: ruby
 rvm:
   - 2.0.0
-  - 1.9.3
+  - 2.1.0
+  - 2.1.1

data/README.md CHANGED

@@ -7,6 +7,8 @@
 It's a Ruby gem that brings a very simple way to extract meta information from URIs using the screen scraping technique.
+Note: This gem is mainly focused on screen scraping URLs (presence of protocol, such as: "http", "https" and "ftp"), but it also works with URIs which have the "www" without any protocol defined, like: "www.google.com".
 ## Installation
 Compatible with Ruby 1.9.3+
@@ -31,8 +33,6 @@ An then require the gem:
 The String#scrapify method finds URIs in a string and then gets their metadata, e.g., the page's title, description, images and URI. All the data is returned in a well-formatted hash.
-Note: This gem is mainly focused on screen scraping URLs (presence of protocol, such as: "http", "https" and "ftp"), but it also works with URIs which have the "www" without any protocol defined, like: "www.google.com".
 #### Default usage.
 ``` ruby

data/lib/scrapifier/methods.rb CHANGED

@@ -6,7 +6,7 @@ require 'scrapifier/support'
 module Scrapifier
   # Methods which will be included into the String class.
   module Methods
-    include Scrapifier::Support
+    include Support
     # Get metadata from an URI using the screen scraping technique.
     #

data/lib/scrapifier/support.rb CHANGED

@@ -1,6 +1,9 @@
+require 'scrapifier/xpath'
 module Scrapifier
   # Support methods to get, check and organize data.
   module Support
+    include XPath
     module_function
     # Evaluate the URI's HTML document and get its metadata.
@@ -25,7 +28,7 @@ module Scrapifier
       doc = Nokogiri::HTML(open(uri).read)
       doc.encoding, meta = 'utf-8', { uri: uri }
-      [:title, :description].each do |k|
+      [:title, :description, :keywords, :lang, :encode, :reply_to, :author].each do |k|
         node = doc.xpath(sf_xpaths[k])[0]
         meta[k] = node.nil? ? '-' : node.text
       end
@@ -111,45 +114,16 @@ module Scrapifier
       %r{(^http{1}[s]?://([w]{3}\.)?.+\.(#{exts.join('|')})(\?.+)?$)}i
     end
-    # Collection of xpath that are used to get nodes
-    # from the parsed HTML.
+    # Organize XPaths.
     def sf_xpaths
-      {
-        title: sf_title_xpath,
-        description: sf_desc_xpath,
-        image: sf_img_xpath
-      }
-    end
-    def sf_title_xpath
-      <<-END.gsub(/^\s+\|/, '')
-        |//meta[@property = "og:title"]/@content|
-        |//meta[@name = "title"]/@content|
-        |//meta[@name = "Title"]/@content|
-        |//title|//h1
-      END
-    end
-    def sf_desc_xpath
-      <<-END.gsub(/^\s+\|/, '')
-        |//meta[@property = "og:description"]/@content|
-        |//meta[@name = "description"]/@content|
-        |//meta[@name = "Description"]/@content|
-        |//h1|//h3|//p|//span|//font
-      END
-    end
-    def sf_img_xpath
-      <<-END.gsub(/^\s+\|/, '')
-        |//meta[@property = "og:image"]/@content|
-        |//link[@rel = "image_src"]/@href|
-        |//meta[@itemprop = "image"]/@content|
-        |//div[@id = "logo"]/img/@src|//a[@id = "logo"]/img/@src|
-        |//div[@class = "logo"]/img/@src|//a[@class = "logo"]/img/@src|
-        |//a//img[@width]/@src|//img[@width]/@src|
-        |//a//img[@height]/@src|//img[@height]/@src|
-        |//a//img/@src|//span//img/@src|//img/@src
-      END
+      { title: XPath::TITLE,
+        description: XPath::DESC,
+        keywords: XPath::KEYWORDS,
+        lang: XPath::LANG,
+        encode: XPath::ENCODE,
+        reply_to: XPath::REPLY_TO,
+        author: XPath::AUTHOR,
+        image: XPath::IMG }
     end
     # Check and return only the valid image URIs.

data/lib/scrapifier/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Scrapifier
-  VERSION = '0.0.4'
+  VERSION = '0.0.5'
 end

data/lib/scrapifier/xpath.rb ADDED

@@ -0,0 +1,66 @@
+# coding: utf-8
+module Scrapifier
+  # Collection of all XPaths which are used to find
+  # the nodes within the parsed HTML doc.
+  module XPath
+    TITLE =
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@property="og:title"]/@content|
+        |//meta[@name="title"]/@content|
+        |//meta[@name="Title"]/@content|
+        |//title|//h1
+      END
+    DESC =
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@property="og:description"]/@content|
+        |//meta[@name="description"]/@content|
+        |//meta[@name="Description"]/@content|
+        |//h1|//h3|//p|//span|//font
+      END
+    KEYWORDS =
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@name="keywords"]/@content|
+        |//meta[@name="Keywords"]/@content|
+        |//meta[@property="og:type"]/@content
+      END
+    LANG =
+      <<-END.gsub(/^\s+\|/, '')
+        |//html/@lang|
+        |//meta[@property="og:locale"]/@content|
+        |//meta[@http-equiv="content-language"]/@content
+      END
+    ENCODE =
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta/@charset|
+        |//meta[@http-equiv="content-type"]/@content
+      END
+    REPLY_TO =
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@name="reply_to"]/@content
+      END
+    AUTHOR =
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@name="author"]/@content|
+        |//meta[@name="Author"]/@content|
+        |//meta[@name="reply_to"]/@content
+      END
+    IMG =
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@property="og:image"]/@content|
+        |//link[@rel="image_src"]/@href|
+        |//meta[@itemprop="image"]/@content|
+        |//div[@id="logo"]/img/@src|//a[@id="logo"]/img/@src|
+        |//div[@class="logo"]/img/@src|//a[@class="logo"]/img/@src|
+        |//a//img[@width]/@src|//img[@width]/@src|
+        |//a//img[@height]/@src|//img[@height]/@src|
+        |//a//img/@src|//span//img/@src|//img/@src
+      END
+  end
+end

data/spec/factories/samples.rb CHANGED

@@ -1,3 +1,4 @@
+# coding: utf-8
 module Factories
   private

data/spec/scrapifier_spec.rb CHANGED

@@ -46,24 +46,44 @@ describe String do
       subject(:hash) { "Look this awesome site #{misc[:http]}".scrapify }
       it "includes a field with the site's title" do
-        hash[:title].is_a?(String).should be_true
-        hash[:title].empty?.should be_false
+        hash[:title].is_a?(String).should be true
+        hash[:title].empty?.should be false
       end
       it "includes a field with the site's description" do
-        hash[:description].is_a?(String).should be_true
-        hash[:description].empty?.should be_false
+        hash[:description].is_a?(String).should be true
+        hash[:description].empty?.should be false
       end
       it 'includes a field with the page URI' do
-        hash[:uri].is_a?(String).should be_true
-        hash[:uri].empty?.should be_false
+        hash[:uri].is_a?(String).should be true
+        hash[:uri].empty?.should be false
         hash[:uri].should eq(misc[:http])
       end
+      it "includes a field with the site's keywords" do
+        hash[:keywords].is_a?(String).should be true
+      end
+      it "includes a field with the site's language" do
+        hash[:lang].is_a?(String).should be true
+      end
+      it "includes a field with the site's encode" do
+        hash[:encode].is_a?(String).should be true
+      end
+      it "includes a field with the site's reply email address" do
+        hash[:reply_to].is_a?(String).should be true
+      end
+      it "includes a field with the site's author name or email" do
+        hash[:author].is_a?(String).should be true
+      end
       it "includes a field with image URIs from the site's head/body" do
         unless hash[:images].empty?
-          hash[:images].is_a?(Array).should be_true
+          hash[:images].is_a?(Array).should be true
           hash[:images].sample.should match(regexes[:image][:all])
         end
       end
@@ -77,8 +97,8 @@ describe String do
     it "can choose the URI in the String to be scrapified" do
       hash = "Check out these awesome sites: #{misc[:http]} and #{misc[:www]}".scrapify(which: 1, images: :png)
       [:title, :description, :uri].each do |key|
-        hash[key].is_a?(String).should be_true
-        hash[key].empty?.should be_false
+        hash[key].is_a?(String).should be true
+        hash[key].empty?.should be false
       end
       hash[:uri].should eq("http://#{misc[:www]}")
       hash[:images].sample.should match(regexes[:image][:png])
@@ -175,7 +195,7 @@ describe String do
     end
     it 'always returns an Array' do
-      checked.each { |c| c[1].is_a?(Array).should be_true }
+      checked.each { |c| c[1].is_a?(Array).should be true }
     end
   end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scrapifier
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - Tiago Guedes
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-06-06 00:00:00.000000000 Z
+date: 2014-06-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -84,6 +84,7 @@ files:
 - lib/scrapifier/methods.rb
 - lib/scrapifier/support.rb
 - lib/scrapifier/version.rb
+- lib/scrapifier/xpath.rb
 - scrapifier.gemspec
 - spec/factories/samples.rb
 - spec/scrapifier_spec.rb