url_scraper 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/url_scraper.rb +26 -17
- data/lib/url_scraper/version.rb +1 -1
- metadata +2 -8
data/lib/url_scraper.rb
CHANGED
@@ -4,17 +4,18 @@ require 'nokogiri'
|
|
4
4
|
require 'restclient'
|
5
5
|
require 'logger'
|
6
6
|
require 'thor'
|
7
|
+
require 'cgi'
|
7
8
|
|
8
9
|
module UrlScraper
|
9
10
|
# Tell rails to load all assets
|
10
11
|
class Engine < Rails::Engine
|
11
|
-
|
12
|
+
|
12
13
|
end
|
13
14
|
|
14
15
|
class CLI < Thor
|
15
16
|
|
16
17
|
end
|
17
|
-
|
18
|
+
|
18
19
|
# Handles the url request
|
19
20
|
|
20
21
|
# Fetch Open Graph data from the specified URI. Makes an
|
@@ -23,14 +24,14 @@ module UrlScraper
|
|
23
24
|
#
|
24
25
|
# Pass <tt>false</tt> for the second argument if you want to
|
25
26
|
# see invalid (i.e. missing a required attribute) data.
|
26
|
-
|
27
|
-
def self.fetch(uri, strict = true)
|
28
|
-
parse(RestClient.get(uri).body, strict)
|
27
|
+
|
28
|
+
def self.fetch(uri, strict = true)
|
29
|
+
parse(RestClient.get(uri).body, strict, uri)
|
29
30
|
rescue RestClient::Exception, SocketError
|
30
31
|
false
|
31
32
|
end
|
32
|
-
|
33
|
-
def self.parse(html, strict = true)
|
33
|
+
|
34
|
+
def self.parse(html, strict = true, uri)
|
34
35
|
logger = Logger.new(STDOUT)
|
35
36
|
doc = Nokogiri::HTML.parse(html)
|
36
37
|
page = UrlScraper::Object.new
|
@@ -40,20 +41,28 @@ module UrlScraper
|
|
40
41
|
end
|
41
42
|
end
|
42
43
|
|
43
|
-
page.title = (doc.at_css('title').text rescue nil) if page.title.nil?
|
44
|
+
page.title = (doc.at_css('title').text rescue nil) if page.title.nil?
|
44
45
|
if page.description.nil?
|
45
46
|
page.description = doc.at_css("meta[name='description']")['content'] unless doc.at_css("meta[name='description']").nil?
|
46
47
|
end
|
47
48
|
if page.image.nil?
|
48
|
-
image_array =
|
49
|
+
image_array = []
|
50
|
+
doc.css("img").each do |img|
|
51
|
+
next if img["src"].to_s.empty?
|
52
|
+
image = URI.escape(img["src"].strip)
|
53
|
+
image = image.gsub(/([{}|\^\[\]\@`])/) {|s| CGI.escape(s)} # escape characters that URI.escape doesn't get
|
54
|
+
image = URI.parse(uri).merge(URI.parse image.to_s).to_s
|
55
|
+
image_array << image
|
56
|
+
end
|
49
57
|
page.image = image_array unless image_array.empty?
|
50
58
|
end
|
51
59
|
# return false if page.keys.empty?
|
52
60
|
# return false unless page.valid? if strict
|
61
|
+
page.image = Array.wrap(page.image)
|
53
62
|
page
|
54
63
|
# return doc
|
55
64
|
end
|
56
|
-
|
65
|
+
|
57
66
|
TYPES = {
|
58
67
|
'activity' => %w(activity sport),
|
59
68
|
'business' => %w(bar company cafe hotel restaurant),
|
@@ -64,38 +73,38 @@ module UrlScraper
|
|
64
73
|
'product' => %w(album book drink food game movie product song tv_show),
|
65
74
|
'website' => %w(blog website)
|
66
75
|
}
|
67
|
-
|
76
|
+
|
68
77
|
# The UrlScraper::Object is a Hash with method accessors for
|
69
78
|
# all detected Open Graph attributes.
|
70
79
|
class Object < Hashie::Mash
|
71
80
|
MANDATORY_ATTRIBUTES = %w(title type image url)
|
72
|
-
|
81
|
+
|
73
82
|
# The object type.
|
74
83
|
def type
|
75
84
|
self['type']
|
76
85
|
end
|
77
|
-
|
86
|
+
|
78
87
|
# The schema under which this particular object lies. May be any of
|
79
88
|
# the keys of the TYPES constant.
|
80
89
|
def schema
|
81
|
-
UrlScraper::TYPES.each_pair do |schema, types|
|
90
|
+
UrlScraper::TYPES.each_pair do |schema, types|
|
82
91
|
return schema if types.include?(self.type)
|
83
92
|
end
|
84
93
|
nil
|
85
94
|
end
|
86
|
-
|
95
|
+
|
87
96
|
UrlScraper::TYPES.values.flatten.each do |type|
|
88
97
|
define_method "#{type}?" do
|
89
98
|
self.type == type
|
90
99
|
end
|
91
100
|
end
|
92
|
-
|
101
|
+
|
93
102
|
UrlScraper::TYPES.keys.each do |scheme|
|
94
103
|
define_method "#{scheme}?" do
|
95
104
|
self.type == scheme || UrlScraper::TYPES[scheme].include?(self.type)
|
96
105
|
end
|
97
106
|
end
|
98
|
-
|
107
|
+
|
99
108
|
# If the Open Graph information for this object doesn't contain
|
100
109
|
# the mandatory attributes, this will be <tt>false</tt>.
|
101
110
|
def valid?
|
data/lib/url_scraper/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-07-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -144,18 +144,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
144
144
|
- - ! '>='
|
145
145
|
- !ruby/object:Gem::Version
|
146
146
|
version: '0'
|
147
|
-
segments:
|
148
|
-
- 0
|
149
|
-
hash: 2025570187396456876
|
150
147
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
151
148
|
none: false
|
152
149
|
requirements:
|
153
150
|
- - ! '>='
|
154
151
|
- !ruby/object:Gem::Version
|
155
152
|
version: '0'
|
156
|
-
segments:
|
157
|
-
- 0
|
158
|
-
hash: 2025570187396456876
|
159
153
|
requirements: []
|
160
154
|
rubyforge_project:
|
161
155
|
rubygems_version: 1.8.24
|