url_scraper 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/url_scraper.rb +26 -17
- data/lib/url_scraper/version.rb +1 -1
- metadata +2 -8
data/lib/url_scraper.rb
CHANGED
@@ -4,17 +4,18 @@ require 'nokogiri'
|
|
4
4
|
require 'restclient'
|
5
5
|
require 'logger'
|
6
6
|
require 'thor'
|
7
|
+
require 'cgi'
|
7
8
|
|
8
9
|
module UrlScraper
|
9
10
|
# Tell rails to load all assets
|
10
11
|
class Engine < Rails::Engine
|
11
|
-
|
12
|
+
|
12
13
|
end
|
13
14
|
|
14
15
|
class CLI < Thor
|
15
16
|
|
16
17
|
end
|
17
|
-
|
18
|
+
|
18
19
|
# Handles the url request
|
19
20
|
|
20
21
|
# Fetch Open Graph data from the specified URI. Makes an
|
@@ -23,14 +24,14 @@ module UrlScraper
|
|
23
24
|
#
|
24
25
|
# Pass <tt>false</tt> for the second argument if you want to
|
25
26
|
# see invalid (i.e. missing a required attribute) data.
|
26
|
-
|
27
|
-
def self.fetch(uri, strict = true)
|
28
|
-
parse(RestClient.get(uri).body, strict)
|
27
|
+
|
28
|
+
def self.fetch(uri, strict = true)
|
29
|
+
parse(RestClient.get(uri).body, strict, uri)
|
29
30
|
rescue RestClient::Exception, SocketError
|
30
31
|
false
|
31
32
|
end
|
32
|
-
|
33
|
-
def self.parse(html, strict = true)
|
33
|
+
|
34
|
+
def self.parse(html, strict = true, uri)
|
34
35
|
logger = Logger.new(STDOUT)
|
35
36
|
doc = Nokogiri::HTML.parse(html)
|
36
37
|
page = UrlScraper::Object.new
|
@@ -40,20 +41,28 @@ module UrlScraper
|
|
40
41
|
end
|
41
42
|
end
|
42
43
|
|
43
|
-
page.title = (doc.at_css('title').text rescue nil) if page.title.nil?
|
44
|
+
page.title = (doc.at_css('title').text rescue nil) if page.title.nil?
|
44
45
|
if page.description.nil?
|
45
46
|
page.description = doc.at_css("meta[name='description']")['content'] unless doc.at_css("meta[name='description']").nil?
|
46
47
|
end
|
47
48
|
if page.image.nil?
|
48
|
-
image_array =
|
49
|
+
image_array = []
|
50
|
+
doc.css("img").each do |img|
|
51
|
+
next if img["src"].to_s.empty?
|
52
|
+
image = URI.escape(img["src"].strip)
|
53
|
+
image = image.gsub(/([{}|\^\[\]\@`])/) {|s| CGI.escape(s)} # escape characters that URI.escape doesn't get
|
54
|
+
image = URI.parse(uri).merge(URI.parse image.to_s).to_s
|
55
|
+
image_array << image
|
56
|
+
end
|
49
57
|
page.image = image_array unless image_array.empty?
|
50
58
|
end
|
51
59
|
# return false if page.keys.empty?
|
52
60
|
# return false unless page.valid? if strict
|
61
|
+
page.image = Array.wrap(page.image)
|
53
62
|
page
|
54
63
|
# return doc
|
55
64
|
end
|
56
|
-
|
65
|
+
|
57
66
|
TYPES = {
|
58
67
|
'activity' => %w(activity sport),
|
59
68
|
'business' => %w(bar company cafe hotel restaurant),
|
@@ -64,38 +73,38 @@ module UrlScraper
|
|
64
73
|
'product' => %w(album book drink food game movie product song tv_show),
|
65
74
|
'website' => %w(blog website)
|
66
75
|
}
|
67
|
-
|
76
|
+
|
68
77
|
# The UrlScraper::Object is a Hash with method accessors for
|
69
78
|
# all detected Open Graph attributes.
|
70
79
|
class Object < Hashie::Mash
|
71
80
|
MANDATORY_ATTRIBUTES = %w(title type image url)
|
72
|
-
|
81
|
+
|
73
82
|
# The object type.
|
74
83
|
def type
|
75
84
|
self['type']
|
76
85
|
end
|
77
|
-
|
86
|
+
|
78
87
|
# The schema under which this particular object lies. May be any of
|
79
88
|
# the keys of the TYPES constant.
|
80
89
|
def schema
|
81
|
-
UrlScraper::TYPES.each_pair do |schema, types|
|
90
|
+
UrlScraper::TYPES.each_pair do |schema, types|
|
82
91
|
return schema if types.include?(self.type)
|
83
92
|
end
|
84
93
|
nil
|
85
94
|
end
|
86
|
-
|
95
|
+
|
87
96
|
UrlScraper::TYPES.values.flatten.each do |type|
|
88
97
|
define_method "#{type}?" do
|
89
98
|
self.type == type
|
90
99
|
end
|
91
100
|
end
|
92
|
-
|
101
|
+
|
93
102
|
UrlScraper::TYPES.keys.each do |scheme|
|
94
103
|
define_method "#{scheme}?" do
|
95
104
|
self.type == scheme || UrlScraper::TYPES[scheme].include?(self.type)
|
96
105
|
end
|
97
106
|
end
|
98
|
-
|
107
|
+
|
99
108
|
# If the Open Graph information for this object doesn't contain
|
100
109
|
# the mandatory attributes, this will be <tt>false</tt>.
|
101
110
|
def valid?
|
data/lib/url_scraper/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-07-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -144,18 +144,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
144
144
|
- - ! '>='
|
145
145
|
- !ruby/object:Gem::Version
|
146
146
|
version: '0'
|
147
|
-
segments:
|
148
|
-
- 0
|
149
|
-
hash: 2025570187396456876
|
150
147
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
151
148
|
none: false
|
152
149
|
requirements:
|
153
150
|
- - ! '>='
|
154
151
|
- !ruby/object:Gem::Version
|
155
152
|
version: '0'
|
156
|
-
segments:
|
157
|
-
- 0
|
158
|
-
hash: 2025570187396456876
|
159
153
|
requirements: []
|
160
154
|
rubyforge_project:
|
161
155
|
rubygems_version: 1.8.24
|