scrapifier 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d645984640446b98bdc5bdf71972c991b361b75e
4
+ data.tar.gz: 9bd772bf8ab26ab4dda602fa69eee8c12ae45e39
5
+ SHA512:
6
+ metadata.gz: ff5dd829fd8e41af883fccd65ade03bb44d968f71a5457a0f2ceeb3afb0e389f71b44c0b559b352793293f4675cd9556c6e0bbab9799b637f1c4e6e7bdbb61ee
7
+ data.tar.gz: 3b974c372000f5d4f795f32074bd4da64e4fd910e8623f8446169c18b6880b1967aee105bf5df4e9de36fd054697c567727d4a8eee48def87f51b93b4117f556
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ .rspec
7
+ Gemfile.lock
8
+ InstalledFiles
9
+ _yardoc
10
+ coverage
11
+ doc/
12
+ lib/bundler/man
13
+ pkg
14
+ rdoc
15
+ spec/reports
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
source 'https://rubygems.org'

# All gem dependencies are declared in scrapifier.gemspec.
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Tiago Guedes
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,83 @@
1
+ # Scrapifier
2
+
3
+ It's a Ruby gem that brings a very simple way to extract meta information from URIs using the screen scraping technique.
4
+
5
+ ## Installation
6
+
7
+ Compatible with Ruby 1.9.3+
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'scrapifier'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install scrapifier
20
+
21
+ ## Usage
22
+
23
+ The method finds a URI in the String and gets some meta information from it, like the page's title, description, images and the URI. All the data is returned in a well-formatted Hash.
24
+
25
+ #### Default usage.
26
+
27
+ ``` ruby
28
+ 'Wow! What an awesome site: http://adtangerine.com!'.scrapify
29
+ #=> {
30
+ # title: "AdTangerine | Advertising Platform for Social Media",
31
+ # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
32
+ # images: ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png", "http://s3-us-west-2.amazonaws.com/adtangerine-prod/users/avatars/000/000/834/thumb/275747_1118382211_1929809351_n.jpg", "http://adtangerine.com/assets/foobar.gif"],
33
+ # uri: "http://adtangerine.com"
34
+ # }
35
+ ```
36
+
37
+ #### Allow only certain image types.
38
+
39
+ ``` ruby
40
+ 'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: :jpg
41
+ #=> {
42
+ # title: "AdTangerine | Advertising Platform for Social Media",
43
+ # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
44
+ # images: ["http://s3-us-west-2.amazonaws.com/adtangerine-prod/users/avatars/000/000/834/thumb/275747_1118382211_1929809351_n.jpg"],
45
+ # uri: "http://adtangerine.com"
46
+ # }
47
+
48
+ 'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: [:png, :gif]
49
+ #=> {
50
+ # title: "AdTangerine | Advertising Platform for Social Media",
51
+ # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
52
+ # images: ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/foobar.gif"],
53
+ # uri: "http://adtangerine.com"
54
+ # }
55
+ ```
56
+
57
+ #### Choose which URI in the String should be scraped.
58
+
59
+ ``` ruby
60
+ 'Check out: http://adtangerine.com and www.twitflink.com'.scrapify which: 1
61
+ #=> {
62
+ # title: "TwitFlink | Find a link!",
63
+ # description: "TwitFlink is a very simple searching tool that allows people to find out links tweeted by any user from Twitter.",
64
+ # images: ["http://www.twitflink.com//assets/tf_logo.png", "http://twitflink.com/assets/tf_logo.png"],
65
+ # uri: "http://www.twitflink.com"
66
+ # }
67
+
68
+ 'Check out: http://adtangerine.com and www.twitflink.com'.scrapify({ which: 0, images: :gif })
69
+ #=> {
70
+ # title: "AdTangerine | Advertising Platform for Social Media",
71
+ # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
72
+ # images: ["http://adtangerine.com/assets/foobar.gif"],
73
+ # uri: "http://adtangerine.com"
74
+ # }
75
+ ```
76
+
77
+ ## Contributing
78
+
79
+ 1. Fork it
80
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
81
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
82
+ 4. Push to the branch (`git push origin my-new-feature`)
83
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
require 'bundler/gem_tasks'
require 'rspec/core/rake_task'

# `rake spec` runs the RSpec suite; it is also the default task.
RSpec::Core::RakeTask.new(:spec)
task default: :spec
@@ -0,0 +1,70 @@
1
# coding: utf-8
require 'nokogiri'
require 'open-uri'
require 'scrapifier/support'

module Scrapifier
  # Public scraping API mixed into String (see lib/scrapifier.rb).
  module Methods
    include Scrapifier::Support

    # Gets meta data from a URI using the screen scraping technique.
    #
    # Example:
    #   >> 'Wow! What an awesome site: http://adtangerine.com!'.scrapify
    #   => {
    #        title: "AdTangerine | Advertising Platform for Social Media",
    #        description: "AdTangerine is an advertising platform that...",
    #        images: ["http://adtangerine.com/assets/logo_adt_og.png", ...],
    #        uri: "http://adtangerine.com"
    #      }
    # Arguments:
    #   options: (Hash)
    #     - :which:  (Integer) which URI in the String to use, 0-based.
    #     - :images: (Symbol or Array) image extensions allowed in the result.
    #
    # Returns an empty Hash when no URI is found, the extension is not
    # allowed, or the page cannot be fetched/parsed (best-effort contract).
    def scrapify(options = {})
      uri = find_uri(options[:which])
      return {} if uri.nil?

      if uri =~ sf_regex(:image)
        sf_image_meta(uri, options[:images])
      else
        sf_page_meta(uri, options[:images])
      end
    end

    # Looks for URIs in the String.
    #
    # Example:
    #   >> 'Wow! What an awesome site: http://adtangerine.com!'.find_uri
    #   => 'http://adtangerine.com'
    #   >> 'Sites: http://adtangerine.com and www.twitflink.com'.find_uri 1
    #   => 'http://www.twitflink.com'
    # Arguments:
    #   which: (Integer)
    #     - Which URI in the String: first (0), second (1) and so on.
    #
    # Returns nil when no URI is matched; prepends "http://" when the
    # match carries no protocol (e.g. "www.twitflink.com").
    def find_uri(which = 0)
      # Coerce nil and String indexes ("1" -> 1); previously a String index
      # raised inside the rescue and silently returned nil.
      index = which.to_i
      match = scan(sf_regex(:uri))[index]
      uri = match && match[0]
      return uri if uri.nil? || uri =~ sf_regex(:protocol)
      "http://#{uri}"
    end

    private

    # Builds the meta Hash for a direct image URI: when the extension is
    # allowed, every key receives the image URI itself; otherwise {}.
    def sf_image_meta(uri, allowed)
      image = (sf_check_img_ext(uri, allowed)[0] rescue nil)
      return {} if image.nil?
      { title: image, description: image, uri: image, images: image }
    end

    # Downloads and parses the page, extracting title, description, images
    # and the URI. Any network/parsing failure yields an empty Hash.
    def sf_page_meta(uri, allowed)
      meta = {}
      doc = Nokogiri::HTML(open(uri).read)
      doc.encoding = 'utf-8'

      [:title, :description].each do |key|
        # '-' is the documented placeholder when a node is missing.
        meta[key] = (doc.xpath(sf_paths[key])[0].text rescue '-')
      end

      meta[:images] = sf_fix_imgs(doc.xpath(sf_paths[:image]), uri, allowed)
      meta[:uri] = uri
      meta
    rescue
      {}
    end
  end
end
@@ -0,0 +1,144 @@
1
module Scrapifier
  # Private helpers shared by Scrapifier::Methods: regex building, image
  # filtering and URI normalization.
  module Support
    private

    # Filters images, returning only those with allowed extensions.
    #
    # Example:
    #   >> sf_check_img_ext('http://source.com/image.gif', :jpg)
    #   => []
    #   >> sf_check_img_ext(['http://source.com/image.gif', 'http://source.com/image.jpg'], [:jpg, :png])
    #   => ['http://source.com/image.jpg']
    # Arguments:
    #   images: (String or Array)
    #     - Images which will be checked.
    #   allowed: (String, Symbol or Array)
    #     - Allowed image extensions; nil/empty means "allow all".
    def sf_check_img_ext(images, allowed = [])
      candidates = case images
                   when String then images.split
                   when Array  then images
                   else []
                   end
      candidates.select { |img| img =~ sf_regex(:image, allowed || []) }
    end

    # Selects regexes for URIs, protocols and image extensions.
    #
    # Example:
    #   >> sf_regex(:image, :jpg)
    #   => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i
    # Arguments:
    #   type: (Symbol or String)
    #     - Regex type: :uri, :protocol or :image.
    #   args: (*)
    #     - For :image, the allowed extensions.
    def sf_regex(type, *args)
      type = type.to_sym unless type.is_a?(Symbol)
      return sf_img_regex(args.flatten) if type == :image

      {
        uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
        protocol: /((ht|f)tp[s]?)/i
      }[type]
    end

    # Builds image regexes according to the required extensions.
    #
    # Example:
    #   >> sf_img_regex
    #   => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i
    #   >> sf_img_regex([:jpg, :png])
    #   => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|png)(\?.+)?$)/i
    # Arguments:
    #   exts: (Array)
    #     - Image extensions which will be included in the regex.
    def sf_img_regex(exts = [])
      exts = [exts].flatten unless exts.is_a?(Array)
      # Drop nils so a [nil] input can't yield an empty alternation group.
      exts = exts.compact
      if exts.empty?
        exts = %w(jpg jpeg png gif)
      elsif exts.include?(:jpg) && !exts.include?(:jpeg)
        # jpg implies jpeg; use += instead of push to avoid mutating the
        # caller's array.
        exts += [:jpeg]
      end
      # Interpolation builds the identical regex that eval did, without
      # executing generated code.
      /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(#{exts.join('|')})(\?.+)?$)/i
    end

    # Collection of XPath expressions used to pull content from HTML tags
    # via Node#xpath.
    # See more: http://nokogiri.org/tutorials/searching_a_xml_html_document.html
    #
    # Example:
    #   >> sf_paths[:title]
    #   => '//meta[@property = "og:title"]/@content | ... | //title | //h1'
    def sf_paths
      {
        title: '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1',
        description: '//meta[@property = "og:description"]/@content | //meta[@name = "description"]/@content | //meta[@name = "Description"]/@content | //h1 | //h3 | //p | //span | //font',
        image: '//meta[@property = "og:image"]/@content | //link[@rel = "image_src"]/@href | //meta[@itemprop = "image"]/@content | //div[@id = "logo"]/img/@src | //a[@id = "logo"]/img/@src | //div[@class = "logo"]/img/@src | //a[@class = "logo"]/img/@src | //a//img[@width]/@src | //img[@width]/@src | //a//img[@height]/@src | //img[@height]/@src | //a//img/@src | //span//img/@src'
      }
    end

    # Checks and returns only the valid image URIs.
    #
    # Example:
    #   >> sf_fix_imgs(['http://adtangerine.com/image.png', '/assets/image.jpg'], 'http://adtangerine.com', :jpg)
    #   => ['http://adtangerine.com/assets/image.jpg']
    # Arguments:
    #   imgs: (Array)
    #     - Image URIs got from the HTML doc.
    #   uri: (String)
    #     - Used as basis for URIs that don't have any protocol/domain set.
    #   exts: (Symbol or Array)
    #     - Allowed image extensions.
    def sf_fix_imgs(imgs, uri, exts = [])
      fixed = imgs.map do |img|
        img = img.to_s
        img = sf_fix_protocol(img, sf_domain(uri)) unless img =~ sf_regex(:protocol)
        img if img =~ sf_regex(:image)
      end
      sf_check_img_ext(fixed.compact, exts)
    end

    # Fixes image URIs that don't include a protocol/domain.
    #
    # Example:
    #   >> sf_fix_protocol('/assets/image.jpg', 'adtangerine.com')
    #   => 'http://adtangerine.com/assets/image.jpg'
    #   >> sf_fix_protocol('//s.ytimg.com/yts/img/youtub_img.png', 'youtube.com')
    #   => 'http://s.ytimg.com/yts/img/youtub_img.png'
    # Arguments:
    #   path: (String)
    #     - URI path having no protocol/domain set.
    #   domain: (String)
    #     - Domain that will be prepended onto the path.
    def sf_fix_protocol(path, domain)
      if path =~ /^\/\/[^\/]+/
        # Protocol-relative URI ("//host/path"): just prepend the scheme.
        'http:' << path
      else
        # Insert a slash only when the path doesn't already start with one.
        "http://#{domain}#{'/' unless path =~ /^\/[^\/]+/}#{path}"
      end
    end

    # Returns the domain from a URI.
    #
    # Example:
    #   >> sf_domain('http://adtangerine.com')
    #   => 'adtangerine.com'
    # Arguments:
    #   uri: (String)
    #     - URI.
    def sf_domain(uri)
      (uri.split('/')[2] rescue '')
    end
  end
end
@@ -0,0 +1,3 @@
1
module Scrapifier
  # Gem version, kept in sync with the gemspec via `require 'scrapifier/version'`.
  VERSION = '0.0.1'
end
data/lib/scrapifier.rb ADDED
@@ -0,0 +1,4 @@
1
# coding: utf-8
require 'scrapifier/methods'

# Extend every String with the Scrapifier public API (#scrapify, #find_uri).
String.send :include, Scrapifier::Methods
@@ -0,0 +1,26 @@
1
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'scrapifier/version'

Gem::Specification.new do |spec|
  spec.name          = 'scrapifier'
  spec.version       = Scrapifier::VERSION
  spec.authors       = ['Tiago Guedes']
  spec.email         = ['tiagopog@gmail.com']
  spec.description   = 'A very simple way to extract meta information from URIs using the screen scraping technique.'
  spec.summary       = 'Extends the Ruby String class with a screen scraping method.'
  spec.homepage      = 'https://github.com/tiagopog/scrapifier'
  spec.license       = 'MIT'

  # Package everything tracked by git; expose bin/ scripts and test files.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_runtime_dependency 'nokogiri', '~> 1.6'

  spec.add_development_dependency 'bundler', '~> 1.3'
  spec.add_development_dependency 'rspec', '~> 2.14'
  spec.add_development_dependency 'rake', '~> 10.1'
end
@@ -0,0 +1,40 @@
1
# Sample URIs, image links and reference regexes shared by the specs.
module Factories
  private

  # Fixture data: miscellaneous URIs, image URIs grouped by extension, and
  # the regexes the gem is expected to build.
  def sf_samples
    {
      misc: {
        http:  'http://adtangerine.com',
        https: 'https://rubygems.org/gems/string_awesome',
        ftp:   'ftp://ftpserver.com',
        www:   'www.twitflink.com'
      },
      images: {
        jpg: [
          'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
          'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
          'http://foobar.com.br/nice-image.jpg'
        ],
        png: [
          'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
          'https://foobar.br/awesome_image.png',
          'https://bar.foobar.br/foo/var/image.png?foo=bar',
        ],
        gif: [
          'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
          'http://foobar.com/ugly_image.gif',
          'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
        ]
      },
      regexes: {
        image: {
          all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
          jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
          png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
          gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
        },
        uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
        protocol: /((ht|f)tp[s]?)/i
      }
    }
  end
end
@@ -0,0 +1,269 @@
1
# coding: utf-8
require 'spec_helper'
include Factories

describe String do
  let(:images)  { sf_samples[:images] }
  let(:misc)    { sf_samples[:misc] }
  let(:regexes) { sf_samples[:regexes] }

  #
  # String#scrapify
  #

  describe '#scrapify' do
    context 'when no URI is matched in the String' do
      subject { 'String without any URI.'.scrapify }

      it { should eq({}) }
    end

    context 'when the website was not found' do
      subject { 'Check out this http://someweirduri.com.br'.scrapify }

      it { should eq({}) }
    end

    context 'when an image URI is matched' do
      let(:jpg) { images[:jpg][0] }
      let(:png) { images[:png][0] }
      let(:gif) { images[:gif][0] }

      it 'sets the same value for :title, :description and :uri keys' do
        "Say my name: #{jpg}".scrapify.should include(title: jpg, description: jpg, uri: jpg)
      end

      it 'allows all the standard image extensions by default (even GIFs)' do
        "Smile GIF! Oh, wait... #{gif}".scrapify.should include(title: gif, description: gif, uri: gif)
      end

      it 'returns an empty Hash if the extension is not allowed' do
        "PNG is awesome! #{png}".scrapify(images: [:jpg]).should eq({})
      end
    end

    context 'when a website URI is matched in the String and a Hash is returned' do
      subject(:hash) { "Look this awesome site #{misc[:http]}".scrapify }

      it "includes a field with the site's title" do
        hash[:title].is_a?(String).should be_true
        hash[:title].empty?.should be_false
      end

      it "includes a field with the site's description" do
        hash[:description].is_a?(String).should be_true
        hash[:description].empty?.should be_false
      end

      it 'includes a field with the page URI' do
        hash[:uri].is_a?(String).should be_true
        hash[:uri].empty?.should be_false
        hash[:uri].should eq(misc[:http])
      end

      it "includes a field with image URIs from the site's head/body" do
        hash[:images].is_a?(Array).should be_true
        hash[:images].sample.should match(regexes[:image][:all])
      end
    end

    it "includes a field with only the allowed types of image URIs from the site's head/body" do
      misc[:http].scrapify(images: :png)[:images].sample.should match(regexes[:image][:png])
    end

    it 'can choose the URI in the String to be scrapified' do
      hash = "Check out these awesome sites: #{misc[:http]} and #{misc[:www]}".scrapify(which: 1, images: :png)
      [:title, :description, :uri].each do |key|
        hash[key].is_a?(String).should be_true
        hash[key].empty?.should be_false
      end
      hash[:uri].should eq("http://#{misc[:www]}")
      hash[:images].sample.should match(regexes[:image][:png])
    end
  end

  #
  # String#find_uri
  #

  describe '#find_uri' do
    let(:sample_uris) { misc.map { |u| u[1] } }
    let(:str) { "Awesome sites: #{sample_uris.join ' and '}" }

    it 'matches the first URI in the String by default' do
      str.send(:find_uri).should eq(sample_uris[0])
    end

    it 'matches the second URI in the String (https)' do
      str.send(:find_uri, 1).should eq(sample_uris[1])
    end

    # NOTE: index 2 is the ftp sample URI ('ftp://ftpserver.com'), not www.
    it 'matches the third URI in the String (ftp)' do
      str.send(:find_uri, 2).should eq(sample_uris[2])
    end

    context 'when no URI is matched' do
      it 'returns nil' do
        'Lorem ipsum dolor.'.send(:find_uri).should be_nil
      end

      it 'returns nil (no presence of http|https|ftp|www)' do
        'Check this out: google.com'.send(:find_uri).should be_nil
      end
    end
  end

  #
  # String#sf_check_img_ext
  #

  describe '#sf_check_img_ext' do
    let(:img)  { images[:jpg].sample }
    let(:imgs) { images.map { |i| i[1] }.flatten }
    let(:checked) do
      {
        str:   ''.send(:sf_check_img_ext, img),
        array: ''.send(:sf_check_img_ext, imgs),
        jpg:   ''.send(:sf_check_img_ext, imgs, [:jpg]),
        png:   ''.send(:sf_check_img_ext, imgs, :png),
        gif:   ''.send(:sf_check_img_ext, imgs, 'gif')
      }
    end

    context 'when no argument is passed' do
      it { expect { ''.send(:sf_check_img_ext) }.to raise_error(ArgumentError) }
    end

    context 'when only the first argument is defined' do
      it 'allows a String as argument' do
        checked[:str].should have(1).item
      end

      it 'allows an Array as argument' do
        checked[:jpg].should have(3).item
      end

      it 'allows all the image extensions by default' do
        checked[:array].should have(9).item
      end
    end

    context 'when the two arguments are defined' do
      it 'allows a Symbol as the second argument' do
        checked[:png].should have(3).item
      end

      it 'allows a String as the second argument' do
        checked[:gif].should have(3).item
      end

      it 'allows an Array as the second argument' do
        checked[:jpg].should have(3).item
      end

      it 'returns an Array with only image types allowed' do
        [:jpg, :png, :gif].each { |ext| checked[ext].should have(3).item }
      end
    end

    context 'when no image is found/allowed' do
      it 'returns an empty Array' do
        # Was an empty example that asserted nothing.
        ''.send(:sf_check_img_ext, 'http://foobar.com/not_an_image.pdf').should eq([])
      end
    end

    it 'always returns an Array' do
      checked.each { |c| c[1].is_a?(Array).should be_true }
    end
  end

  #
  # String#sf_regex
  #

  describe '#sf_regex' do
    context 'when it needs a regex to match any kind of URI' do
      subject { ''.send(:sf_regex, :uri) }

      # Was misc[:http] for all four iterations, so https/ftp/www were
      # never actually exercised.
      [:http, :https, :ftp, :www].each do |p|
        it { should match(misc[p]) }
      end
    end

    context 'when it needs a regex to match only image uris' do
      subject { ''.send(:sf_regex, :image) }

      [:jpg, :png, :gif].each do |ext|
        it { should match(sf_samples[:images][ext].sample) }
      end
    end
  end

  #
  # String#sf_img_regex
  #

  describe '#sf_img_regex' do
    let(:img_regexes) { regexes[:image] }

    context 'when no argument is passed' do
      subject(:regex) { ''.send(:sf_img_regex) }

      it 'returns a regex that matches all image extensions' do
        regex.should eq(img_regexes[:all])
      end

      it 'matches all image extensions' do
        [:jpg, :png, :gif].each { |ext| images[ext].sample.should match(regex) }
      end
    end

    context 'when only jpg is allowed' do
      subject(:regex) { ''.send(:sf_img_regex, [:jpg]) }

      it 'returns a regex that matches only jpg images' do
        regex.should eq(img_regexes[:jpg])
      end

      it 'matches only the defined extension' do
        regex.should match(images[:jpg].sample)
      end

      it "doesn't match any other extension" do
        [:png, :gif].each { |ext| regex.should_not match(images[ext].sample) }
      end
    end

    context 'when only png is allowed' do
      subject(:regex) { ''.send(:sf_img_regex, :png) }

      it 'returns a regex that matches only png images' do
        regex.should eq(img_regexes[:png])
      end

      it 'matches only the defined extension' do
        regex.should match(images[:png].sample)
      end

      it "doesn't match any other extension" do
        [:jpg, :gif].each { |ext| regex.should_not match(images[ext].sample) }
      end
    end

    context 'when only gif (argh!) is allowed' do
      subject(:regex) { ''.send(:sf_img_regex, :gif) }

      it 'returns a regex that matches only gif images' do
        regex.should eq(img_regexes[:gif])
      end

      it 'matches only the defined extension' do
        regex.should match(images[:gif].sample)
      end

      it "doesn't match any other extension" do
        [:jpg, :png].each { |ext| regex.should_not match(images[ext].sample) }
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
require 'rubygems'
require 'bundler/setup'

# Load the gem under test plus the shared spec fixtures.
require 'scrapifier'
require 'factories/uris'
5
+
metadata ADDED
@@ -0,0 +1,117 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scrapifier
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Tiago Guedes
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-04-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '2.14'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '2.14'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10.1'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10.1'
69
+ description: A very simple way to extract meta information from URIs using the screen
70
+ scraping technique.
71
+ email:
72
+ - tiagopog@gmail.com
73
+ executables: []
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - ".gitignore"
78
+ - Gemfile
79
+ - LICENSE.txt
80
+ - README.md
81
+ - Rakefile
82
+ - lib/scrapifier.rb
83
+ - lib/scrapifier/methods.rb
84
+ - lib/scrapifier/support.rb
85
+ - lib/scrapifier/version.rb
86
+ - scrapifier.gemspec
87
+ - spec/factories/uris.rb
88
+ - spec/scrapifier_spec.rb
89
+ - spec/spec_helper.rb
90
+ homepage: https://github.com/tiagopog/scrapifier
91
+ licenses:
92
+ - MIT
93
+ metadata: {}
94
+ post_install_message:
95
+ rdoc_options: []
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ required_rubygems_version: !ruby/object:Gem::Requirement
104
+ requirements:
105
+ - - ">="
106
+ - !ruby/object:Gem::Version
107
+ version: '0'
108
+ requirements: []
109
+ rubyforge_project:
110
+ rubygems_version: 2.2.2
111
+ signing_key:
112
+ specification_version: 4
113
+ summary: Extends the Ruby String class with a screen scraping method.
114
+ test_files:
115
+ - spec/factories/uris.rb
116
+ - spec/scrapifier_spec.rb
117
+ - spec/spec_helper.rb