RubyGems - url_scraper - Versions diffs - 0.0.4 → 0.0.5 - Mend

url_scraper 0.0.4 → 0.0.5

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (3)

data/lib/url_scraper.rb CHANGED

@@ -4,17 +4,18 @@ require 'nokogiri'
 require 'restclient'
 require 'logger'
 require 'thor'
+require 'cgi'
 module UrlScraper
   # Tell rails to load all assets
   class Engine < Rails::Engine
   end
   class CLI < Thor
   end
   # Handles the url request
   # Fetch Open Graph data from the specified URI. Makes an
@@ -23,14 +24,14 @@ module UrlScraper
   #
   # Pass <tt>false</tt> for the second argument if you want to
   # see invalid (i.e. missing a required attribute) data.
-  def self.fetch(uri, strict = true)
-    parse(RestClient.get(uri).body, strict)
+  def self.fetch(uri, strict = true)
+    parse(RestClient.get(uri).body, strict, uri)
     rescue RestClient::Exception, SocketError
       false
   end
-  def self.parse(html, strict = true)
+  def self.parse(html, strict = true, uri)
     logger = Logger.new(STDOUT)
     doc = Nokogiri::HTML.parse(html)
     page = UrlScraper::Object.new
@@ -40,20 +41,28 @@ module UrlScraper
       end
     end
-    page.title = (doc.at_css('title').text rescue nil) if page.title.nil?
+    page.title = (doc.at_css('title').text rescue nil) if page.title.nil?
     if page.description.nil?
       page.description = doc.at_css("meta[name='description']")['content'] unless doc.at_css("meta[name='description']").nil?
     end
     if page.image.nil?
-      image_array = doc.css("img").take(3).collect{|img| img['src']}
+      image_array = []
+      doc.css("img").each do |img|
+        next if img["src"].to_s.empty?
+        image = URI.escape(img["src"].strip)
+        image = image.gsub(/([{}|\^\[\]\@`])/) {|s| CGI.escape(s)} # escape characters that URI.escape doesn't get
+        image = URI.parse(uri).merge(URI.parse image.to_s).to_s
+        image_array << image
+      end
       page.image = image_array unless image_array.empty?
     end
     # return false if page.keys.empty?
     # return false unless page.valid? if strict
+    page.image = Array.wrap(page.image)
     page
     # return doc
   end
   TYPES = {
     'activity' => %w(activity sport),
     'business' => %w(bar company cafe hotel restaurant),
@@ -64,38 +73,38 @@ module UrlScraper
     'product' => %w(album book drink food game movie product song tv_show),
     'website' => %w(blog website)
   }
   # The UrlScraper::Object is a Hash with method accessors for
   # all detected Open Graph attributes.
   class Object < Hashie::Mash
     MANDATORY_ATTRIBUTES = %w(title type image url)
     # The object type.
     def type
       self['type']
     end
     # The schema under which this particular object lies. May be any of
     # the keys of the TYPES constant.
     def schema
-      UrlScraper::TYPES.each_pair do |schema, types|
+      UrlScraper::TYPES.each_pair do |schema, types|
         return schema if types.include?(self.type)
       end
       nil
     end
     UrlScraper::TYPES.values.flatten.each do |type|
       define_method "#{type}?" do
         self.type == type
       end
     end
     UrlScraper::TYPES.keys.each do |scheme|
       define_method "#{scheme}?" do
         self.type == scheme || UrlScraper::TYPES[scheme].include?(self.type)
       end
     end
     # If the Open Graph information for this object doesn't contain
     # the mandatory attributes, this will be <tt>false</tt>.
     def valid?

data/lib/url_scraper/version.rb CHANGED

@@ -1,3 +1,3 @@
 module UrlScraper
-  VERSION = "0.0.4"
+  VERSION = "0.0.5"
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: url_scraper
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-05-08 00:00:00.000000000 Z
+date: 2013-07-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -144,18 +144,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
-      segments:
-      - 0
-      hash: 2025570187396456876
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
-      segments:
-      - 0
-      hash: 2025570187396456876
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.24