url_scraper 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -4,17 +4,18 @@ require 'nokogiri'
4
4
  require 'restclient'
5
5
  require 'logger'
6
6
  require 'thor'
7
+ require 'cgi'
7
8
 
8
9
  module UrlScraper
9
10
  # Tell Rails to load all assets
10
11
  class Engine < Rails::Engine
11
-
12
+
12
13
  end
13
14
 
14
15
  class CLI < Thor
15
16
 
16
17
  end
17
-
18
+
18
19
  # Handles the URL request
19
20
 
20
21
  # Fetch Open Graph data from the specified URI. Makes an
@@ -23,14 +24,14 @@ module UrlScraper
23
24
  #
24
25
  # Pass <tt>false</tt> for the second argument if you want to
25
26
  # see invalid (i.e. missing a required attribute) data.
26
-
27
- def self.fetch(uri, strict = true)
28
- parse(RestClient.get(uri).body, strict)
27
+
28
+ def self.fetch(uri, strict = true)
29
+ parse(RestClient.get(uri).body, strict, uri)
29
30
  rescue RestClient::Exception, SocketError
30
31
  false
31
32
  end
32
-
33
- def self.parse(html, strict = true)
33
+
34
+ def self.parse(html, strict = true, uri)
34
35
  logger = Logger.new(STDOUT)
35
36
  doc = Nokogiri::HTML.parse(html)
36
37
  page = UrlScraper::Object.new
@@ -40,20 +41,28 @@ module UrlScraper
40
41
  end
41
42
  end
42
43
 
43
- page.title = (doc.at_css('title').text rescue nil) if page.title.nil?
44
+ page.title = (doc.at_css('title').text rescue nil) if page.title.nil?
44
45
  if page.description.nil?
45
46
  page.description = doc.at_css("meta[name='description']")['content'] unless doc.at_css("meta[name='description']").nil?
46
47
  end
47
48
  if page.image.nil?
48
- image_array = doc.css("img").take(3).collect{|img| img['src']}
49
+ image_array = []
50
+ doc.css("img").each do |img|
51
+ next if img["src"].to_s.empty?
52
+ image = URI.escape(img["src"].strip)
53
+ image = image.gsub(/([{}|\^\[\]\@`])/) {|s| CGI.escape(s)} # escape characters that URI.escape doesn't get
54
+ image = URI.parse(uri).merge(URI.parse image.to_s).to_s
55
+ image_array << image
56
+ end
49
57
  page.image = image_array unless image_array.empty?
50
58
  end
51
59
  # return false if page.keys.empty?
52
60
  # return false unless page.valid? if strict
61
+ page.image = Array.wrap(page.image)
53
62
  page
54
63
  # return doc
55
64
  end
56
-
65
+
57
66
  TYPES = {
58
67
  'activity' => %w(activity sport),
59
68
  'business' => %w(bar company cafe hotel restaurant),
@@ -64,38 +73,38 @@ module UrlScraper
64
73
  'product' => %w(album book drink food game movie product song tv_show),
65
74
  'website' => %w(blog website)
66
75
  }
67
-
76
+
68
77
  # The UrlScraper::Object is a Hash with method accessors for
69
78
  # all detected Open Graph attributes.
70
79
  class Object < Hashie::Mash
71
80
  MANDATORY_ATTRIBUTES = %w(title type image url)
72
-
81
+
73
82
  # The object type.
74
83
  def type
75
84
  self['type']
76
85
  end
77
-
86
+
78
87
  # The schema under which this particular object lies. May be any of
79
88
  # the keys of the TYPES constant.
80
89
  def schema
81
- UrlScraper::TYPES.each_pair do |schema, types|
90
+ UrlScraper::TYPES.each_pair do |schema, types|
82
91
  return schema if types.include?(self.type)
83
92
  end
84
93
  nil
85
94
  end
86
-
95
+
87
96
  UrlScraper::TYPES.values.flatten.each do |type|
88
97
  define_method "#{type}?" do
89
98
  self.type == type
90
99
  end
91
100
  end
92
-
101
+
93
102
  UrlScraper::TYPES.keys.each do |scheme|
94
103
  define_method "#{scheme}?" do
95
104
  self.type == scheme || UrlScraper::TYPES[scheme].include?(self.type)
96
105
  end
97
106
  end
98
-
107
+
99
108
  # If the Open Graph information for this object doesn't contain
100
109
  # the mandatory attributes, this will be <tt>false</tt>.
101
110
  def valid?
@@ -1,3 +1,3 @@
1
1
  module UrlScraper
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-05-08 00:00:00.000000000 Z
12
+ date: 2013-07-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -144,18 +144,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
144
144
  - - ! '>='
145
145
  - !ruby/object:Gem::Version
146
146
  version: '0'
147
- segments:
148
- - 0
149
- hash: 2025570187396456876
150
147
  required_rubygems_version: !ruby/object:Gem::Requirement
151
148
  none: false
152
149
  requirements:
153
150
  - - ! '>='
154
151
  - !ruby/object:Gem::Version
155
152
  version: '0'
156
- segments:
157
- - 0
158
- hash: 2025570187396456876
159
153
  requirements: []
160
154
  rubyforge_project:
161
155
  rubygems_version: 1.8.24