url_scraper 0.0.4 → 0.0.5

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
@@ -4,17 +4,18 @@ require 'nokogiri'
4
4
  require 'restclient'
5
5
  require 'logger'
6
6
  require 'thor'
7
+ require 'cgi'
7
8
 
8
9
  module UrlScraper
9
10
  # Tell rails to load all assets
10
11
  class Engine < Rails::Engine
11
-
12
+
12
13
  end
13
14
 
14
15
  class CLI < Thor
15
16
 
16
17
  end
17
-
18
+
18
19
  # Handles the url request
19
20
 
20
21
  # Fetch Open Graph data from the specified URI. Makes an
@@ -23,14 +24,14 @@ module UrlScraper
23
24
  #
24
25
  # Pass <tt>false</tt> for the second argument if you want to
25
26
  # see invalid (i.e. missing a required attribute) data.
26
-
27
- def self.fetch(uri, strict = true)
28
- parse(RestClient.get(uri).body, strict)
27
+
28
+ def self.fetch(uri, strict = true)
29
+ parse(RestClient.get(uri).body, strict, uri)
29
30
  rescue RestClient::Exception, SocketError
30
31
  false
31
32
  end
32
-
33
- def self.parse(html, strict = true)
33
+
34
+ def self.parse(html, strict = true, uri)
34
35
  logger = Logger.new(STDOUT)
35
36
  doc = Nokogiri::HTML.parse(html)
36
37
  page = UrlScraper::Object.new
@@ -40,20 +41,28 @@ module UrlScraper
40
41
  end
41
42
  end
42
43
 
43
- page.title = (doc.at_css('title').text rescue nil) if page.title.nil?
44
+ page.title = (doc.at_css('title').text rescue nil) if page.title.nil?
44
45
  if page.description.nil?
45
46
  page.description = doc.at_css("meta[name='description']")['content'] unless doc.at_css("meta[name='description']").nil?
46
47
  end
47
48
  if page.image.nil?
48
- image_array = doc.css("img").take(3).collect{|img| img['src']}
49
+ image_array = []
50
+ doc.css("img").each do |img|
51
+ next if img["src"].to_s.empty?
52
+ image = URI.escape(img["src"].strip)
53
+ image = image.gsub(/([{}|\^\[\]\@`])/) {|s| CGI.escape(s)} # escape characters that URI.escape doesn't get
54
+ image = URI.parse(uri).merge(URI.parse image.to_s).to_s
55
+ image_array << image
56
+ end
49
57
  page.image = image_array unless image_array.empty?
50
58
  end
51
59
  # return false if page.keys.empty?
52
60
  # return false unless page.valid? if strict
61
+ page.image = Array.wrap(page.image)
53
62
  page
54
63
  # return doc
55
64
  end
56
-
65
+
57
66
  TYPES = {
58
67
  'activity' => %w(activity sport),
59
68
  'business' => %w(bar company cafe hotel restaurant),
@@ -64,38 +73,38 @@ module UrlScraper
64
73
  'product' => %w(album book drink food game movie product song tv_show),
65
74
  'website' => %w(blog website)
66
75
  }
67
-
76
+
68
77
  # The UrlScraper::Object is a Hash with method accessors for
69
78
  # all detected Open Graph attributes.
70
79
  class Object < Hashie::Mash
71
80
  MANDATORY_ATTRIBUTES = %w(title type image url)
72
-
81
+
73
82
  # The object type.
74
83
  def type
75
84
  self['type']
76
85
  end
77
-
86
+
78
87
  # The schema under which this particular object lies. May be any of
79
88
  # the keys of the TYPES constant.
80
89
  def schema
81
- UrlScraper::TYPES.each_pair do |schema, types|
90
+ UrlScraper::TYPES.each_pair do |schema, types|
82
91
  return schema if types.include?(self.type)
83
92
  end
84
93
  nil
85
94
  end
86
-
95
+
87
96
  UrlScraper::TYPES.values.flatten.each do |type|
88
97
  define_method "#{type}?" do
89
98
  self.type == type
90
99
  end
91
100
  end
92
-
101
+
93
102
  UrlScraper::TYPES.keys.each do |scheme|
94
103
  define_method "#{scheme}?" do
95
104
  self.type == scheme || UrlScraper::TYPES[scheme].include?(self.type)
96
105
  end
97
106
  end
98
-
107
+
99
108
  # If the Open Graph information for this object doesn't contain
100
109
  # the mandatory attributes, this will be <tt>false</tt>.
101
110
  def valid?
@@ -1,3 +1,3 @@
1
1
  module UrlScraper
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-05-08 00:00:00.000000000 Z
12
+ date: 2013-07-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -144,18 +144,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
144
144
  - - ! '>='
145
145
  - !ruby/object:Gem::Version
146
146
  version: '0'
147
- segments:
148
- - 0
149
- hash: 2025570187396456876
150
147
  required_rubygems_version: !ruby/object:Gem::Requirement
151
148
  none: false
152
149
  requirements:
153
150
  - - ! '>='
154
151
  - !ruby/object:Gem::Version
155
152
  version: '0'
156
- segments:
157
- - 0
158
- hash: 2025570187396456876
159
153
  requirements: []
160
154
  rubyforge_project:
161
155
  rubygems_version: 1.8.24