RubyGems - scrapifier - Versions diffs - 0.0.3 → 0.0.4 - Mend

scrapifier 0.0.3 → 0.0.4

Files changed (8) hide show

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 1bf5a65bfe5f1fd54830bf6b2ef57b286661c515
-  data.tar.gz: af7058d4dc2ef44d03c930b869003c12616b1edd
+  metadata.gz: 67a72b01b9c64d37f15c459e11c536f29bd0abc2
+  data.tar.gz: d83f2f8c7ff3203ee2a5472e2e2099a4389af7aa
 SHA512:
-  metadata.gz: c0d9fbe9986f730e57dff1b2164508581e928f1b2a7ccd7b48e0e812fa5002befbf953c30c2385498a21a196ebc3c09cff57106b7d3188956a820ed9bdecc26f
-  data.tar.gz: 1c5f1d0171f91f68acd9d642491adb9de3f1b210087c75117582da68d2cb22457812ae1ee747145bf300652aa175a8c65f9092c5f39ca9fcce73df1fa50de8a6
+  metadata.gz: 2a4abdb171a16b46b89bfb4c43a3262169c08c70125b3ac88e9c4f13209acd5085368b3a4b9399691b9fb9dc89fc4cacfe34b781a26df3c5646bfd4f20f44435
+  data.tar.gz: c246dd2f9770e3835f4423b497b0bb86d38e51ff9030b59aa516be89f11262db159d2938f94e43f60b91fb14c2c56015965fb8556eec963a13e3c568a4c68d9d

data/README.md CHANGED

@@ -23,9 +23,15 @@ Or install it yourself as:
     $ gem install scrapifier
+And then require the gem:
+    require 'scrapifier'
 ## Usage
-The method finds an URI in the String and gets some meta information from it, like the page's title, description, images and the URI. All the data is returned in a well-formatted Hash.
+The String#scrapify method finds URIs in a string and then gets their metadata, e.g., the page's title, description, images and URI. All the data is returned in a well-formatted hash.
+Note: This gem is mainly focused on screen scraping URLs (presence of protocol, such as: "http", "https" and "ftp"), but it also works with URIs which have the "www" without any protocol defined, like: "www.google.com".
 #### Default usage.
@@ -42,7 +48,7 @@ The method finds an URI in the String and gets some meta information from it, li
 #### Allow only certain image types.
 ``` ruby
-'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: :jpg
+'Wow! What an awesome site: http://adtangerine.com!'.scrapify(images: :jpg)
 #=> {
 #   title:       "AdTangerine | Advertising Platform for Social Media",
 #   description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
@@ -50,7 +56,7 @@ The method finds an URI in the String and gets some meta information from it, li
 #   uri:         "http://adtangerine.com"
 # }
-'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: [:png, :gif]
+'Wow! What an awesome site: http://adtangerine.com!'.scrapify(images: [:png, :gif])
 #=> {
 #   title:       "AdTangerine | Advertising Platform for Social Media",
 #   description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
@@ -62,7 +68,7 @@ The method finds an URI in the String and gets some meta information from it, li
 #### Choose which URI you want it to be scraped.
 ``` ruby
-'Check out: http://adtangerine.com and www.twitflink.com'.scrapify which: 1
+'Check out: http://adtangerine.com and www.twitflink.com'.scrapify(which: 1)
 #=> {
 #   title:       "TwitFlink | Find a link!",
 #   description: "TwitFlink is a very simple searching tool that allows people to find out links tweeted by any user from Twitter.",
@@ -70,7 +76,7 @@ The method finds an URI in the String and gets some meta information from it, li
 #   uri:         "http://www.twitflink.com"
 # }
-'Check out: http://adtangerine.com and www.twitflink.com'.scrapify({ which: 0, images: :gif })
+'Check out: http://adtangerine.com and www.twitflink.com'.scrapify(which: 0, images: :gif)
 #=> {
 #   title:       "AdTangerine | Advertising Platform for Social Media",
 #   description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",

data/lib/scrapifier/support.rb CHANGED

@@ -1,4 +1,5 @@
 module Scrapifier
+  # Support methods to get, check and organize data.
   module Support
     module_function
@@ -18,14 +19,17 @@ module Scrapifier
     # Arguments:
     #   uri: (String)
     #     - URI.
-    #   imgs: (Array)
+    #   exts: (Array)
     #     - Allowed type of images.
-    def sf_eval_uri(uri, imgs = [])
+    def sf_eval_uri(uri, exts = [])
       doc = Nokogiri::HTML(open(uri).read)
       doc.encoding, meta = 'utf-8', { uri: uri }
-      [:title, :description].each { |k| meta[k] = (doc.xpath(sf_paths[k])[0].text rescue '-') }
-      meta[:images] = sf_fix_imgs(doc.xpath(sf_paths[:image]), uri, imgs)
+      [:title, :description].each do |k|
+        node = doc.xpath(sf_xpaths[k])[0]
+        meta[k] = node.nil? ? '-' : node.text
+      end
+      meta[:images] = sf_fix_imgs(doc.xpath(sf_xpaths[:image]), uri, exts)
       meta
     rescue SocketError
@@ -33,11 +37,14 @@ module Scrapifier
     end
     # Filter images returning those with the allowed extensions.
-    #
+    #
     # Example:
     #   >> sf_check_img_ext('http://source.com/image.gif', :jpg)
     #   => []
-    #   >> sf_check_img_ext(['http://source.com/image.gif', 'http://source.com/image.jpg'], [:jpg, :png])
+    #   >> sf_check_img_ext(
+    #        ['http://source.com/image.gif','http://source.com/image.jpg'],
+    #        [:jpg, :png]
+    #      )
     #   => ['http://source.com/image.jpg']
     # Arguments:
     #   images: (String or Array)
@@ -55,32 +62,37 @@ module Scrapifier
     end
     # Select regexes for URIs, protocols and image extensions.
-    #
+    #
     # Example:
     #   >> sf_regex(:uri)
-    #   => /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
+    #   => /\b((((ht|f)tp[s]?:\/\/).../i,
     #   >> sf_regex(:image, :jpg)
     #   => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg)(\?.+)?$)/i
     # Arguments:
     #   type: (Symbol or String)
-    #     - Regex type.
+    #     - Regex type: :uri, :protocol, :image
     #   args: (*)
     #     - Anything.
     def sf_regex(type, *args)
       type = type.to_sym unless type.is_a? Symbol
-      if type == :image
-        sf_img_regex args.flatten
-      else
-        regexes = {
-          uri:      /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
-          protocol: /((ht|f)tp[s]?)/i
-        }
-        regexes[type]
-      end
+      type == :image && sf_img_regex(args.flatten) || sf_uri_regex[type]
+    end
+    # Build a hash with the URI regexes.
+    def sf_uri_regex
+      { uri: %r{\b(
+               (((ht|f)tp[s]?://)|([a-z0-9]+\.))+
+               (?<!@)
+               ([a-z0-9\_\-]+)
+               (\.[a-z]+)+
+               ([\?/\:][a-z0-9_=%&@\?\./\-\:\#\(\)]+)?
+               /?
+             )}ix,
+        protocol: /((ht|f)tp[s]?)/i }
     end
     # Build image regexes according to the required extensions.
-    #
+    #
     # Example:
     #   >> sf_img_regex
     #   => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i
@@ -91,54 +103,90 @@ module Scrapifier
     #     - Image extensions which will be included in the regex.
     def sf_img_regex(exts = [])
       exts = [exts].flatten unless exts.is_a?(Array)
-      if exts.nil? or exts.empty?
+      if exts.nil? || exts.empty?
         exts = %w(jpg jpeg png gif)
-      elsif exts.include?(:jpg) and !exts.include?(:jpeg)
+      elsif exts.include?(:jpg) && !exts.include?(:jpeg)
         exts.push :jpeg
-      end
-      eval "/(^http{1}[s]?:\\/\\/([w]{3}\\.)?.+\\.(#{exts.join('|')})(\\?.+)?$)/i"
+      end
+      %r{(^http{1}[s]?://([w]{3}\.)?.+\.(#{exts.join('|')})(\?.+)?$)}i
     end
-    # Collection of paths used to get content from HTML tags via Node#xpath method.
-    # See more: http://nokogiri.org/tutorials/searching_a_xml_html_document.html
-    #
-    # Example:
-    #   >> sf_paths[:title]
-    #   => '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1'
-    def sf_paths
+    # Collection of XPath expressions used to get nodes
+    # from the parsed HTML.
+    def sf_xpaths
       {
-        title:       '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1',
-        description: '//meta[@property = "og:description"]/@content | //meta[@name = "description"]/@content | //meta[@name = "Description"]/@content | //h1 | //h3 | //p | //span | //font',
-        image:       '//meta[@property = "og:image"]/@content | //link[@rel = "image_src"]/@href | //meta[@itemprop = "image"]/@content | //div[@id = "logo"]/img/@src | //a[@id = "logo"]/img/@src | //div[@class = "logo"]/img/@src | //a[@class = "logo"]/img/@src | //a//img[@width]/@src | //img[@width]/@src | //a//img[@height]/@src | //img[@height]/@src | //a//img/@src | //span//img/@src'
+        title: sf_title_xpath,
+        description: sf_desc_xpath,
+        image: sf_img_xpath
       }
     end
+    def sf_title_xpath
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@property = "og:title"]/@content|
+        |//meta[@name = "title"]/@content|
+        |//meta[@name = "Title"]/@content|
+        |//title|//h1
+      END
+    end
+    def sf_desc_xpath
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@property = "og:description"]/@content|
+        |//meta[@name = "description"]/@content|
+        |//meta[@name = "Description"]/@content|
+        |//h1|//h3|//p|//span|//font
+      END
+    end
+    def sf_img_xpath
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@property = "og:image"]/@content|
+        |//link[@rel = "image_src"]/@href|
+        |//meta[@itemprop = "image"]/@content|
+        |//div[@id = "logo"]/img/@src|//a[@id = "logo"]/img/@src|
+        |//div[@class = "logo"]/img/@src|//a[@class = "logo"]/img/@src|
+        |//a//img[@width]/@src|//img[@width]/@src|
+        |//a//img[@height]/@src|//img[@height]/@src|
+        |//a//img/@src|//span//img/@src|//img/@src
+      END
+    end
     # Check and return only the valid image URIs.
-    #
+    #
     # Example:
-    #   >>  sf_fix_imgs(['http://adtangerine.com/image.png', '/assets/image.jpg'], 'http://adtangerine.com', :jpg)
+    #   >>  sf_fix_imgs(
+    #         ['http://adtangerine.com/image.png', '/assets/image.jpg'],
+    #         'http://adtangerine.com',
+    #         :jpg
+    #       )
     #   => ['http://adtangerine.com/assets/image.jpg']
     # Arguments:
     #   imgs: (Array)
     #     - Image URIs got from the HTML doc.
     #   uri: (String)
-    #     - Used as basis to the URIs that don't have any protocol/domain set.
+    #     - Used as basis to the URIs that don't have any protocol/domain set.
     #   exts: (Symbol or Array)
     #     -  Allowed image extensions.
     def sf_fix_imgs(imgs, uri, exts = [])
       sf_check_img_ext(imgs.map do |img|
-        img = img.to_s
-        img = sf_fix_protocol(img, sf_domain(uri)) unless img =~ sf_regex(:protocol)
-        img if (img =~ sf_regex(:image))
+        img = img.to_s
+        unless img =~ sf_regex(:protocol)
+          img = sf_fix_protocol(img, sf_domain(uri))
+        end
+        img if img =~ sf_regex(:image)
       end.compact, exts)
     end
     # Fix image URIs that don't have a protocol/domain set.
-    #
+    #
     # Example:
     #   >> sf_fix_protocol('/assets/image.jpg', 'http://adtangerine.com')
     #   => 'http://adtangerine/assets/image.jpg'
-    #   >> sf_fix_protocol('//s.ytimg.com/yts/img/youtub_img.png', 'https://youtube.com')
+    #   >> sf_fix_protocol(
+    #        '//s.ytimg.com/yts/img/youtub_img.png',
+    #        'https://youtube.com'
+    #      )
     #   => 'https://s.ytimg.com/yts/img/youtub_img.png'
     # Arguments:
     #   path: (String)
@@ -146,15 +194,15 @@ module Scrapifier
     #   domain: (String)
     #     - Domain that will be prepended into the path.
     def sf_fix_protocol(path, domain)
-      if path =~ /^\/\/[^\/]+/
+      if path =~ %r{^//[^/]+}
         'http:' << path
       else
-         "http://#{domain}#{'/' unless path =~ /^\/[^\/]+/}#{path}"
-      end
+        "http://#{domain}#{'/' unless path =~ %r{^/[^/]+}}#{path}"
+      end
     end
     # Return the URI domain.
-    #
+    #
     # Example:
     #   >> sf_domain('http://adtangerine.com')
     #   => 'adtangerine.com'
@@ -162,7 +210,8 @@ module Scrapifier
     #   uri: (String)
     #     - URI.
     def sf_domain(uri)
-      (uri.split('/')[2] rescue '')
+      uri = uri.to_s.split('/')
+      uri.empty? ? '' : uri[2]
     end
   end
-end
+end

data/lib/scrapifier/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Scrapifier
-  VERSION = '0.0.3'
+  VERSION = '0.0.4'
 end

data/spec/factories/samples.rb CHANGED

@@ -1,40 +1,41 @@
 module Factories
   private
-    def sf_samples
-      {
-        misc: {
-          http:  'http://adtangerine.com',
-          https: 'https://rubygems.org/gems/string_awesome',
-          ftp:   'ftp://ftpserver.com',
-          www:   'www.twitflink.com'
+  def sf_samples
+    {
+      misc: {
+        http:  'http://adtangerine.com',
+        https: 'https://rubygems.org/gems/string_awesome',
+        ftp:   'ftp://ftpserver.com',
+        www:   'www.twitflink.com'
+      },
+      images: {
+        jpg: [
+          'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
+          'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
+          'http://foobar.com.br/nice-image.jpg'
+        ],
+        png: [
+          'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
+          'https://foobar.br/awesome_image.png',
+          'https://bar.foobar.br/foo/var/image.png?foo=bar',
+        ],
+        gif: [
+          'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
+          'http://foobar.com/ugly_image.gif',
+          'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
+        ]
+      },
+      regexes: {
+        image: {
+          all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
+          jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
+          png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
+          gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
         },
-        images: {
-          jpg: [
-            'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
-            'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
-            'http://foobar.com.br/nice-image.jpg'
-          ],
-          png: [
-            'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
-            'https://foobar.br/awesome_image.png',
-            'https://bar.foobar.br/foo/var/image.png?foo=bar',
-          ],
-          gif: [
-            'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
-            'http://foobar.com/ugly_image.gif',
-            'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
-          ]
-        },
-        regexes: {
-          image: {
-            all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
-            jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
-            png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
-            gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
-          },
-          uri:      /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
-          protocol: /((ht|f)tp[s]?)/i
-        }
+        uri:      /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
+        protocol: /((ht|f)tp[s]?)/i
       }
-    end
-end
+    }
+  end
+end

data/spec/scrapifier_spec.rb CHANGED

@@ -62,13 +62,16 @@ describe String do
       end
       it "includes a field with image URIs from the site's head/body" do
-        hash[:images].is_a?(Array).should be_true
-        hash[:images].sample.should match(regexes[:image][:all])
+        unless hash[:images].empty?
+          hash[:images].is_a?(Array).should be_true
+          hash[:images].sample.should match(regexes[:image][:all])
+        end
       end
     end
     it "includes a field with only the allowed types of image URIs from the site's head/body" do
-      misc[:http].scrapify(images: :png)[:images].sample.should match(regexes[:image][:png])
+      image = misc[:http].scrapify(images: :png)[:images].sample
+      image.should match(regexes[:image][:png]) unless image.nil?
     end
     it "can choose the URI in the String to be scrapified" do

data/spec/spec_helper.rb CHANGED

@@ -2,4 +2,3 @@ require 'rubygems'
 require 'bundler/setup'
 require 'scrapifier'
 require 'factories/samples'

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scrapifier
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 platform: ruby
 authors:
 - Tiago Guedes
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-30 00:00:00.000000000 Z
+date: 2014-06-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri