scrapifier 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +11 -5
- data/lib/scrapifier/support.rb +96 -47
- data/lib/scrapifier/version.rb +1 -1
- data/spec/factories/samples.rb +37 -36
- data/spec/scrapifier_spec.rb +6 -3
- data/spec/spec_helper.rb +0 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 67a72b01b9c64d37f15c459e11c536f29bd0abc2
|
4
|
+
data.tar.gz: d83f2f8c7ff3203ee2a5472e2e2099a4389af7aa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a4abdb171a16b46b89bfb4c43a3262169c08c70125b3ac88e9c4f13209acd5085368b3a4b9399691b9fb9dc89fc4cacfe34b781a26df3c5646bfd4f20f44435
|
7
|
+
data.tar.gz: c246dd2f9770e3835f4423b497b0bb86d38e51ff9030b59aa516be89f11262db159d2938f94e43f60b91fb14c2c56015965fb8556eec963a13e3c568a4c68d9d
|
data/README.md
CHANGED
@@ -23,9 +23,15 @@ Or install it yourself as:
|
|
23
23
|
|
24
24
|
$ gem install scrapifier
|
25
25
|
|
26
|
+
And then require the gem:
|
27
|
+
|
28
|
+
$ require 'scrapifier'
|
29
|
+
|
26
30
|
## Usage
|
27
31
|
|
28
|
-
The method finds
|
32
|
+
The String#scrapify method finds URIs in a string and then gets their metadata, e.g., the page's title, description, images and URI. All the data is returned in a well-formatted hash.
|
33
|
+
|
34
|
+
Note: This gem is mainly focused on screen scraping URLs (presence of protocol, such as: "http", "https" and "ftp"), but it also works with URIs which have the "www" without any protocol defined, like: "www.google.com".
|
29
35
|
|
30
36
|
#### Default usage.
|
31
37
|
|
@@ -42,7 +48,7 @@ The method finds an URI in the String and gets some meta information from it, li
|
|
42
48
|
#### Allow only certain image types.
|
43
49
|
|
44
50
|
``` ruby
|
45
|
-
'Wow! What an awesome site: http://adtangerine.com!'.scrapify
|
51
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify(images: :jpg)
|
46
52
|
#=> {
|
47
53
|
# title: "AdTangerine | Advertising Platform for Social Media",
|
48
54
|
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
@@ -50,7 +56,7 @@ The method finds an URI in the String and gets some meta information from it, li
|
|
50
56
|
# uri: "http://adtangerine.com"
|
51
57
|
# }
|
52
58
|
|
53
|
-
'Wow! What an awesome site: http://adtangerine.com!'.scrapify
|
59
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify(images: [:png, :gif])
|
54
60
|
#=> {
|
55
61
|
# title: "AdTangerine | Advertising Platform for Social Media",
|
56
62
|
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
@@ -62,7 +68,7 @@ The method finds an URI in the String and gets some meta information from it, li
|
|
62
68
|
#### Choose which URI you want to be scraped.
|
63
69
|
|
64
70
|
``` ruby
|
65
|
-
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify
|
71
|
+
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify(which: 1)
|
66
72
|
#=> {
|
67
73
|
# title: "TwitFlink | Find a link!",
|
68
74
|
# description: "TwitFlink is a very simple searching tool that allows people to find out links tweeted by any user from Twitter.",
|
@@ -70,7 +76,7 @@ The method finds an URI in the String and gets some meta information from it, li
|
|
70
76
|
# uri: "http://www.twitflink.com"
|
71
77
|
# }
|
72
78
|
|
73
|
-
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify(
|
79
|
+
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify(which: 0, images: :gif)
|
74
80
|
#=> {
|
75
81
|
# title: "AdTangerine | Advertising Platform for Social Media",
|
76
82
|
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
data/lib/scrapifier/support.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
module Scrapifier
|
2
|
+
# Support methods to get, check and organize data.
|
2
3
|
module Support
|
3
4
|
module_function
|
4
5
|
|
@@ -18,14 +19,17 @@ module Scrapifier
|
|
18
19
|
# Arguments:
|
19
20
|
# uri: (String)
|
20
21
|
# - URI.
|
21
|
-
#
|
22
|
+
# exts: (Array)
|
22
23
|
# - Allowed type of images.
|
23
|
-
def sf_eval_uri(uri,
|
24
|
+
def sf_eval_uri(uri, exts = [])
|
24
25
|
doc = Nokogiri::HTML(open(uri).read)
|
25
26
|
doc.encoding, meta = 'utf-8', { uri: uri }
|
26
27
|
|
27
|
-
[:title, :description].each
|
28
|
-
|
28
|
+
[:title, :description].each do |k|
|
29
|
+
node = doc.xpath(sf_xpaths[k])[0]
|
30
|
+
meta[k] = node.nil? ? '-' : node.text
|
31
|
+
end
|
32
|
+
meta[:images] = sf_fix_imgs(doc.xpath(sf_xpaths[:image]), uri, exts)
|
29
33
|
|
30
34
|
meta
|
31
35
|
rescue SocketError
|
@@ -33,11 +37,14 @@ module Scrapifier
|
|
33
37
|
end
|
34
38
|
|
35
39
|
# Filter images returning those with the allowed extensions.
|
36
|
-
#
|
40
|
+
#
|
37
41
|
# Example:
|
38
42
|
# >> sf_check_img_ext('http://source.com/image.gif', :jpg)
|
39
43
|
# => []
|
40
|
-
# >> sf_check_img_ext(
|
44
|
+
# >> sf_check_img_ext(
|
45
|
+
# ['http://source.com/image.gif','http://source.com/image.jpg'],
|
46
|
+
# [:jpg, :png]
|
47
|
+
# )
|
41
48
|
# => ['http://source.com/image.jpg']
|
42
49
|
# Arguments:
|
43
50
|
# images: (String or Array)
|
@@ -55,32 +62,37 @@ module Scrapifier
|
|
55
62
|
end
|
56
63
|
|
57
64
|
# Select regexes for URIs, protocols and image extensions.
|
58
|
-
#
|
65
|
+
#
|
59
66
|
# Example:
|
60
67
|
# >> sf_regex(:uri)
|
61
|
-
# => /\b((((ht|f)tp[s]?:\/\/)
|
68
|
+
# => /\b((((ht|f)tp[s]?:\/\/).../i,
|
62
69
|
# >> sf_regex(:image, :jpg)
|
63
70
|
# => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg)(\?.+)?$)/i
|
64
71
|
# Arguments:
|
65
72
|
# type: (Symbol or String)
|
66
|
-
# - Regex type
|
73
|
+
# - Regex type: :uri, :protocol, :image
|
67
74
|
# args: (*)
|
68
75
|
# - Anything.
|
69
76
|
def sf_regex(type, *args)
|
70
77
|
type = type.to_sym unless type.is_a? Symbol
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
78
|
+
type == :image && sf_img_regex(args.flatten) || sf_uri_regex[type]
|
79
|
+
end
|
80
|
+
|
81
|
+
# Build a hash with the URI regexes.
|
82
|
+
def sf_uri_regex
|
83
|
+
{ uri: %r{\b(
|
84
|
+
(((ht|f)tp[s]?://)|([a-z0-9]+\.))+
|
85
|
+
(?<!@)
|
86
|
+
([a-z0-9\_\-]+)
|
87
|
+
(\.[a-z]+)+
|
88
|
+
([\?/\:][a-z0-9_=%&@\?\./\-\:\#\(\)]+)?
|
89
|
+
/?
|
90
|
+
)}ix,
|
91
|
+
protocol: /((ht|f)tp[s]?)/i }
|
80
92
|
end
|
81
93
|
|
82
94
|
# Build image regexes according to the required extensions.
|
83
|
-
#
|
95
|
+
#
|
84
96
|
# Example:
|
85
97
|
# >> sf_img_regex
|
86
98
|
# => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i
|
@@ -91,54 +103,90 @@ module Scrapifier
|
|
91
103
|
# - Image extensions which will be included in the regex.
|
92
104
|
def sf_img_regex(exts = [])
|
93
105
|
exts = [exts].flatten unless exts.is_a?(Array)
|
94
|
-
if exts.nil?
|
106
|
+
if exts.nil? || exts.empty?
|
95
107
|
exts = %w(jpg jpeg png gif)
|
96
|
-
elsif exts.include?(:jpg)
|
108
|
+
elsif exts.include?(:jpg) && !exts.include?(:jpeg)
|
97
109
|
exts.push :jpeg
|
98
|
-
end
|
99
|
-
|
110
|
+
end
|
111
|
+
%r{(^http{1}[s]?://([w]{3}\.)?.+\.(#{exts.join('|')})(\?.+)?$)}i
|
100
112
|
end
|
101
113
|
|
102
|
-
# Collection of
|
103
|
-
#
|
104
|
-
|
105
|
-
# Example:
|
106
|
-
# >> sf_paths[:title]
|
107
|
-
# => '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1'
|
108
|
-
def sf_paths
|
114
|
+
# Collection of XPath expressions that are used to get nodes
|
115
|
+
# from the parsed HTML.
|
116
|
+
def sf_xpaths
|
109
117
|
{
|
110
|
-
title:
|
111
|
-
description:
|
112
|
-
image:
|
118
|
+
title: sf_title_xpath,
|
119
|
+
description: sf_desc_xpath,
|
120
|
+
image: sf_img_xpath
|
113
121
|
}
|
114
122
|
end
|
115
123
|
|
124
|
+
def sf_title_xpath
|
125
|
+
<<-END.gsub(/^\s+\|/, '')
|
126
|
+
|//meta[@property = "og:title"]/@content|
|
127
|
+
|//meta[@name = "title"]/@content|
|
128
|
+
|//meta[@name = "Title"]/@content|
|
129
|
+
|//title|//h1
|
130
|
+
END
|
131
|
+
end
|
132
|
+
|
133
|
+
def sf_desc_xpath
|
134
|
+
<<-END.gsub(/^\s+\|/, '')
|
135
|
+
|//meta[@property = "og:description"]/@content|
|
136
|
+
|//meta[@name = "description"]/@content|
|
137
|
+
|//meta[@name = "Description"]/@content|
|
138
|
+
|//h1|//h3|//p|//span|//font
|
139
|
+
END
|
140
|
+
end
|
141
|
+
|
142
|
+
def sf_img_xpath
|
143
|
+
<<-END.gsub(/^\s+\|/, '')
|
144
|
+
|//meta[@property = "og:image"]/@content|
|
145
|
+
|//link[@rel = "image_src"]/@href|
|
146
|
+
|//meta[@itemprop = "image"]/@content|
|
147
|
+
|//div[@id = "logo"]/img/@src|//a[@id = "logo"]/img/@src|
|
148
|
+
|//div[@class = "logo"]/img/@src|//a[@class = "logo"]/img/@src|
|
149
|
+
|//a//img[@width]/@src|//img[@width]/@src|
|
150
|
+
|//a//img[@height]/@src|//img[@height]/@src|
|
151
|
+
|//a//img/@src|//span//img/@src|//img/@src
|
152
|
+
END
|
153
|
+
end
|
154
|
+
|
116
155
|
# Check and return only the valid image URIs.
|
117
|
-
#
|
156
|
+
#
|
118
157
|
# Example:
|
119
|
-
# >> sf_fix_imgs(
|
158
|
+
# >> sf_fix_imgs(
|
159
|
+
# ['http://adtangerine.com/image.png', '/assets/image.jpg'],
|
160
|
+
# 'http://adtangerine.com',
|
161
|
+
# :jpg
|
162
|
+
# )
|
120
163
|
# => ['http://adtangerine.com/assets/image.jpg']
|
121
164
|
# Arguments:
|
122
165
|
# imgs: (Array)
|
123
166
|
# - Image URIs got from the HTML doc.
|
124
167
|
# uri: (String)
|
125
|
-
# - Used as basis to the URIs that don't have any protocol/domain set.
|
168
|
+
# - Used as basis to the URIs that don't have any protocol/domain set.
|
126
169
|
# exts: (Symbol or Array)
|
127
170
|
# - Allowed image extensions.
|
128
171
|
def sf_fix_imgs(imgs, uri, exts = [])
|
129
172
|
sf_check_img_ext(imgs.map do |img|
|
130
|
-
img = img.to_s
|
131
|
-
|
132
|
-
|
173
|
+
img = img.to_s
|
174
|
+
unless img =~ sf_regex(:protocol)
|
175
|
+
img = sf_fix_protocol(img, sf_domain(uri))
|
176
|
+
end
|
177
|
+
img if img =~ sf_regex(:image)
|
133
178
|
end.compact, exts)
|
134
179
|
end
|
135
180
|
|
136
181
|
# Fix image URIs that don't have a protocol/domain set.
|
137
|
-
#
|
182
|
+
#
|
138
183
|
# Example:
|
139
184
|
# >> sf_fix_protocol('/assets/image.jpg', 'http://adtangerine.com')
|
140
185
|
# => 'http://adtangerine/assets/image.jpg'
|
141
|
-
# >> sf_fix_protocol(
|
186
|
+
# >> sf_fix_protocol(
|
187
|
+
# '//s.ytimg.com/yts/img/youtub_img.png',
|
188
|
+
# 'https://youtube.com'
|
189
|
+
# )
|
142
190
|
# => 'https://s.ytimg.com/yts/img/youtub_img.png'
|
143
191
|
# Arguments:
|
144
192
|
# path: (String)
|
@@ -146,15 +194,15 @@ module Scrapifier
|
|
146
194
|
# domain: (String)
|
147
195
|
# - Domain that will be prepended into the path.
|
148
196
|
def sf_fix_protocol(path, domain)
|
149
|
-
if path =~
|
197
|
+
if path =~ %r{^//[^/]+}
|
150
198
|
'http:' << path
|
151
199
|
else
|
152
|
-
|
153
|
-
end
|
200
|
+
"http://#{domain}#{'/' unless path =~ %r{^/[^/]+}}#{path}"
|
201
|
+
end
|
154
202
|
end
|
155
203
|
|
156
204
|
# Return the URI domain.
|
157
|
-
#
|
205
|
+
#
|
158
206
|
# Example:
|
159
207
|
# >> sf_domain('http://adtangerine.com')
|
160
208
|
# => 'adtangerine.com'
|
@@ -162,7 +210,8 @@ module Scrapifier
|
|
162
210
|
# uri: (String)
|
163
211
|
# - URI.
|
164
212
|
def sf_domain(uri)
|
165
|
-
|
213
|
+
uri = uri.to_s.split('/')
|
214
|
+
uri.empty? ? '' : uri[2]
|
166
215
|
end
|
167
216
|
end
|
168
|
-
end
|
217
|
+
end
|
data/lib/scrapifier/version.rb
CHANGED
data/spec/factories/samples.rb
CHANGED
@@ -1,40 +1,41 @@
|
|
1
1
|
module Factories
|
2
2
|
private
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
3
|
+
|
4
|
+
def sf_samples
|
5
|
+
{
|
6
|
+
misc: {
|
7
|
+
http: 'http://adtangerine.com',
|
8
|
+
https: 'https://rubygems.org/gems/string_awesome',
|
9
|
+
ftp: 'ftp://ftpserver.com',
|
10
|
+
www: 'www.twitflink.com'
|
11
|
+
},
|
12
|
+
images: {
|
13
|
+
jpg: [
|
14
|
+
'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
|
15
|
+
'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
|
16
|
+
'http://foobar.com.br/nice-image.jpg'
|
17
|
+
],
|
18
|
+
png: [
|
19
|
+
'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
|
20
|
+
'https://foobar.br/awesome_image.png',
|
21
|
+
'https://bar.foobar.br/foo/var/image.png?foo=bar',
|
22
|
+
],
|
23
|
+
gif: [
|
24
|
+
'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
|
25
|
+
'http://foobar.com/ugly_image.gif',
|
26
|
+
'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
|
27
|
+
]
|
28
|
+
},
|
29
|
+
regexes: {
|
30
|
+
image: {
|
31
|
+
all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
|
32
|
+
jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
|
33
|
+
png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
|
34
|
+
gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
|
10
35
|
},
|
11
|
-
|
12
|
-
|
13
|
-
'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
|
14
|
-
'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
|
15
|
-
'http://foobar.com.br/nice-image.jpg'
|
16
|
-
],
|
17
|
-
png: [
|
18
|
-
'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
|
19
|
-
'https://foobar.br/awesome_image.png',
|
20
|
-
'https://bar.foobar.br/foo/var/image.png?foo=bar',
|
21
|
-
],
|
22
|
-
gif: [
|
23
|
-
'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
|
24
|
-
'http://foobar.com/ugly_image.gif',
|
25
|
-
'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
|
26
|
-
]
|
27
|
-
},
|
28
|
-
regexes: {
|
29
|
-
image: {
|
30
|
-
all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
|
31
|
-
jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
|
32
|
-
png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
|
33
|
-
gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
|
34
|
-
},
|
35
|
-
uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
|
36
|
-
protocol: /((ht|f)tp[s]?)/i
|
37
|
-
}
|
36
|
+
uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
|
37
|
+
protocol: /((ht|f)tp[s]?)/i
|
38
38
|
}
|
39
|
-
|
40
|
-
end
|
39
|
+
}
|
40
|
+
end
|
41
|
+
end
|
data/spec/scrapifier_spec.rb
CHANGED
@@ -62,13 +62,16 @@ describe String do
|
|
62
62
|
end
|
63
63
|
|
64
64
|
it "includes a field with image URIs from the site's head/body" do
|
65
|
-
hash[:images].
|
66
|
-
|
65
|
+
unless hash[:images].empty?
|
66
|
+
hash[:images].is_a?(Array).should be_true
|
67
|
+
hash[:images].sample.should match(regexes[:image][:all])
|
68
|
+
end
|
67
69
|
end
|
68
70
|
end
|
69
71
|
|
70
72
|
it "includes a field with only the allowed types of image URIs from the site's head/body" do
|
71
|
-
misc[:http].scrapify(images: :png)[:images].sample
|
73
|
+
image = misc[:http].scrapify(images: :png)[:images].sample
|
74
|
+
image.should match(regexes[:image][:png]) unless image.nil?
|
72
75
|
end
|
73
76
|
|
74
77
|
it "can choose the URI in the String to be scrapified" do
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tiago Guedes
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-06-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|