scrapifier 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -5
- data/lib/scrapifier/support.rb +96 -47
- data/lib/scrapifier/version.rb +1 -1
- data/spec/factories/samples.rb +37 -36
- data/spec/scrapifier_spec.rb +6 -3
- data/spec/spec_helper.rb +0 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 67a72b01b9c64d37f15c459e11c536f29bd0abc2
|
4
|
+
data.tar.gz: d83f2f8c7ff3203ee2a5472e2e2099a4389af7aa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a4abdb171a16b46b89bfb4c43a3262169c08c70125b3ac88e9c4f13209acd5085368b3a4b9399691b9fb9dc89fc4cacfe34b781a26df3c5646bfd4f20f44435
|
7
|
+
data.tar.gz: c246dd2f9770e3835f4423b497b0bb86d38e51ff9030b59aa516be89f11262db159d2938f94e43f60b91fb14c2c56015965fb8556eec963a13e3c568a4c68d9d
|
data/README.md
CHANGED
@@ -23,9 +23,15 @@ Or install it yourself as:
|
|
23
23
|
|
24
24
|
$ gem install scrapifier
|
25
25
|
|
26
|
+
And then require the gem:
|
27
|
+
|
28
|
+
$ require 'scrapifier'
|
29
|
+
|
26
30
|
## Usage
|
27
31
|
|
28
|
-
The method finds
|
32
|
+
The String#scrapify method finds URIs in a string and then gets their metadata, e.g., the page's title, description, images and URI. All the data is returned in a well-formatted hash.
|
33
|
+
|
34
|
+
Note: This gem is mainly focused on screen scraping URLs (presence of protocol, such as: "http", "https" and "ftp"), but it also works with URIs which have the "www" without any protocol defined, like: "www.google.com".
|
29
35
|
|
30
36
|
#### Default usage.
|
31
37
|
|
@@ -42,7 +48,7 @@ The method finds an URI in the String and gets some meta information from it, li
|
|
42
48
|
#### Allow only certain image types.
|
43
49
|
|
44
50
|
``` ruby
|
45
|
-
'Wow! What an awesome site: http://adtangerine.com!'.scrapify
|
51
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify(images: :jpg)
|
46
52
|
#=> {
|
47
53
|
# title: "AdTangerine | Advertising Platform for Social Media",
|
48
54
|
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
@@ -50,7 +56,7 @@ The method finds an URI in the String and gets some meta information from it, li
|
|
50
56
|
# uri: "http://adtangerine.com"
|
51
57
|
# }
|
52
58
|
|
53
|
-
'Wow! What an awesome site: http://adtangerine.com!'.scrapify
|
59
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify(images: [:png, :gif])
|
54
60
|
#=> {
|
55
61
|
# title: "AdTangerine | Advertising Platform for Social Media",
|
56
62
|
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
@@ -62,7 +68,7 @@ The method finds an URI in the String and gets some meta information from it, li
|
|
62
68
|
#### Choose which URI you want it to be scraped.
|
63
69
|
|
64
70
|
``` ruby
|
65
|
-
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify
|
71
|
+
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify(which: 1)
|
66
72
|
#=> {
|
67
73
|
# title: "TwitFlink | Find a link!",
|
68
74
|
# description: "TwitFlink is a very simple searching tool that allows people to find out links tweeted by any user from Twitter.",
|
@@ -70,7 +76,7 @@ The method finds an URI in the String and gets some meta information from it, li
|
|
70
76
|
# uri: "http://www.twitflink.com"
|
71
77
|
# }
|
72
78
|
|
73
|
-
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify(
|
79
|
+
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify(which: 0, images: :gif)
|
74
80
|
#=> {
|
75
81
|
# title: "AdTangerine | Advertising Platform for Social Media",
|
76
82
|
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
data/lib/scrapifier/support.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
module Scrapifier
|
2
|
+
# Support methods to get, check and organize data.
|
2
3
|
module Support
|
3
4
|
module_function
|
4
5
|
|
@@ -18,14 +19,17 @@ module Scrapifier
|
|
18
19
|
# Arguments:
|
19
20
|
# uri: (String)
|
20
21
|
# - URI.
|
21
|
-
#
|
22
|
+
# exts: (Array)
|
22
23
|
# - Allowed type of images.
|
23
|
-
def sf_eval_uri(uri,
|
24
|
+
def sf_eval_uri(uri, exts = [])
|
24
25
|
doc = Nokogiri::HTML(open(uri).read)
|
25
26
|
doc.encoding, meta = 'utf-8', { uri: uri }
|
26
27
|
|
27
|
-
[:title, :description].each
|
28
|
-
|
28
|
+
[:title, :description].each do |k|
|
29
|
+
node = doc.xpath(sf_xpaths[k])[0]
|
30
|
+
meta[k] = node.nil? ? '-' : node.text
|
31
|
+
end
|
32
|
+
meta[:images] = sf_fix_imgs(doc.xpath(sf_xpaths[:image]), uri, exts)
|
29
33
|
|
30
34
|
meta
|
31
35
|
rescue SocketError
|
@@ -33,11 +37,14 @@ module Scrapifier
|
|
33
37
|
end
|
34
38
|
|
35
39
|
# Filter images returning those with the allowed extensions.
|
36
|
-
#
|
40
|
+
#
|
37
41
|
# Example:
|
38
42
|
# >> sf_check_img_ext('http://source.com/image.gif', :jpg)
|
39
43
|
# => []
|
40
|
-
# >> sf_check_img_ext(
|
44
|
+
# >> sf_check_img_ext(
|
45
|
+
# ['http://source.com/image.gif','http://source.com/image.jpg'],
|
46
|
+
# [:jpg, :png]
|
47
|
+
# )
|
41
48
|
# => ['http://source.com/image.jpg']
|
42
49
|
# Arguments:
|
43
50
|
# images: (String or Array)
|
@@ -55,32 +62,37 @@ module Scrapifier
|
|
55
62
|
end
|
56
63
|
|
57
64
|
# Select regexes for URIs, protocols and image extensions.
|
58
|
-
#
|
65
|
+
#
|
59
66
|
# Example:
|
60
67
|
# >> sf_regex(:uri)
|
61
|
-
# => /\b((((ht|f)tp[s]?:\/\/)
|
68
|
+
# => /\b((((ht|f)tp[s]?:\/\/).../i,
|
62
69
|
# >> sf_regex(:image, :jpg)
|
63
70
|
# => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg)(\?.+)?$)/i
|
64
71
|
# Arguments:
|
65
72
|
# type: (Symbol or String)
|
66
|
-
# - Regex type
|
73
|
+
# - Regex type: :uri, :protocol, :image
|
67
74
|
# args: (*)
|
68
75
|
# - Anything.
|
69
76
|
def sf_regex(type, *args)
|
70
77
|
type = type.to_sym unless type.is_a? Symbol
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
78
|
+
type == :image && sf_img_regex(args.flatten) || sf_uri_regex[type]
|
79
|
+
end
|
80
|
+
|
81
|
+
# Build a hash with the URI regexes.
|
82
|
+
def sf_uri_regex
|
83
|
+
{ uri: %r{\b(
|
84
|
+
(((ht|f)tp[s]?://)|([a-z0-9]+\.))+
|
85
|
+
(?<!@)
|
86
|
+
([a-z0-9\_\-]+)
|
87
|
+
(\.[a-z]+)+
|
88
|
+
([\?/\:][a-z0-9_=%&@\?\./\-\:\#\(\)]+)?
|
89
|
+
/?
|
90
|
+
)}ix,
|
91
|
+
protocol: /((ht|f)tp[s]?)/i }
|
80
92
|
end
|
81
93
|
|
82
94
|
# Build image regexes according to the required extensions.
|
83
|
-
#
|
95
|
+
#
|
84
96
|
# Example:
|
85
97
|
# >> sf_img_regex
|
86
98
|
# => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i
|
@@ -91,54 +103,90 @@ module Scrapifier
|
|
91
103
|
# - Image extensions which will be included in the regex.
|
92
104
|
def sf_img_regex(exts = [])
|
93
105
|
exts = [exts].flatten unless exts.is_a?(Array)
|
94
|
-
if exts.nil?
|
106
|
+
if exts.nil? || exts.empty?
|
95
107
|
exts = %w(jpg jpeg png gif)
|
96
|
-
elsif exts.include?(:jpg)
|
108
|
+
elsif exts.include?(:jpg) && !exts.include?(:jpeg)
|
97
109
|
exts.push :jpeg
|
98
|
-
end
|
99
|
-
|
110
|
+
end
|
111
|
+
%r{(^http{1}[s]?://([w]{3}\.)?.+\.(#{exts.join('|')})(\?.+)?$)}i
|
100
112
|
end
|
101
113
|
|
102
|
-
# Collection of
|
103
|
-
#
|
104
|
-
|
105
|
-
# Example:
|
106
|
-
# >> sf_paths[:title]
|
107
|
-
# => '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1'
|
108
|
-
def sf_paths
|
114
|
+
# Collection of XPath expressions that are used to get nodes
|
115
|
+
# from the parsed HTML.
|
116
|
+
def sf_xpaths
|
109
117
|
{
|
110
|
-
title:
|
111
|
-
description:
|
112
|
-
image:
|
118
|
+
title: sf_title_xpath,
|
119
|
+
description: sf_desc_xpath,
|
120
|
+
image: sf_img_xpath
|
113
121
|
}
|
114
122
|
end
|
115
123
|
|
124
|
+
def sf_title_xpath
|
125
|
+
<<-END.gsub(/^\s+\|/, '')
|
126
|
+
|//meta[@property = "og:title"]/@content|
|
127
|
+
|//meta[@name = "title"]/@content|
|
128
|
+
|//meta[@name = "Title"]/@content|
|
129
|
+
|//title|//h1
|
130
|
+
END
|
131
|
+
end
|
132
|
+
|
133
|
+
def sf_desc_xpath
|
134
|
+
<<-END.gsub(/^\s+\|/, '')
|
135
|
+
|//meta[@property = "og:description"]/@content|
|
136
|
+
|//meta[@name = "description"]/@content|
|
137
|
+
|//meta[@name = "Description"]/@content|
|
138
|
+
|//h1|//h3|//p|//span|//font
|
139
|
+
END
|
140
|
+
end
|
141
|
+
|
142
|
+
def sf_img_xpath
|
143
|
+
<<-END.gsub(/^\s+\|/, '')
|
144
|
+
|//meta[@property = "og:image"]/@content|
|
145
|
+
|//link[@rel = "image_src"]/@href|
|
146
|
+
|//meta[@itemprop = "image"]/@content|
|
147
|
+
|//div[@id = "logo"]/img/@src|//a[@id = "logo"]/img/@src|
|
148
|
+
|//div[@class = "logo"]/img/@src|//a[@class = "logo"]/img/@src|
|
149
|
+
|//a//img[@width]/@src|//img[@width]/@src|
|
150
|
+
|//a//img[@height]/@src|//img[@height]/@src|
|
151
|
+
|//a//img/@src|//span//img/@src|//img/@src
|
152
|
+
END
|
153
|
+
end
|
154
|
+
|
116
155
|
# Check and return only the valid image URIs.
|
117
|
-
#
|
156
|
+
#
|
118
157
|
# Example:
|
119
|
-
# >> sf_fix_imgs(
|
158
|
+
# >> sf_fix_imgs(
|
159
|
+
# ['http://adtangerine.com/image.png', '/assets/image.jpg'],
|
160
|
+
# 'http://adtangerine.com',
|
161
|
+
# :jpg
|
162
|
+
# )
|
120
163
|
# => ['http://adtangerine.com/assets/image.jpg']
|
121
164
|
# Arguments:
|
122
165
|
# imgs: (Array)
|
123
166
|
# - Image URIs got from the HTML doc.
|
124
167
|
# uri: (String)
|
125
|
-
# - Used as basis to the URIs that don't have any protocol/domain set.
|
168
|
+
# - Used as basis to the URIs that don't have any protocol/domain set.
|
126
169
|
# exts: (Symbol or Array)
|
127
170
|
# - Allowed image extensions.
|
128
171
|
def sf_fix_imgs(imgs, uri, exts = [])
|
129
172
|
sf_check_img_ext(imgs.map do |img|
|
130
|
-
img = img.to_s
|
131
|
-
|
132
|
-
|
173
|
+
img = img.to_s
|
174
|
+
unless img =~ sf_regex(:protocol)
|
175
|
+
img = sf_fix_protocol(img, sf_domain(uri))
|
176
|
+
end
|
177
|
+
img if img =~ sf_regex(:image)
|
133
178
|
end.compact, exts)
|
134
179
|
end
|
135
180
|
|
136
181
|
# Fix image URIs that don't have a protocol/domain set.
|
137
|
-
#
|
182
|
+
#
|
138
183
|
# Example:
|
139
184
|
# >> sf_fix_protocol('/assets/image.jpg', 'http://adtangerine.com')
|
140
185
|
# => 'http://adtangerine/assets/image.jpg'
|
141
|
-
# >> sf_fix_protocol(
|
186
|
+
# >> sf_fix_protocol(
|
187
|
+
# '//s.ytimg.com/yts/img/youtub_img.png',
|
188
|
+
# 'https://youtube.com'
|
189
|
+
# )
|
142
190
|
# => 'https://s.ytimg.com/yts/img/youtub_img.png'
|
143
191
|
# Arguments:
|
144
192
|
# path: (String)
|
@@ -146,15 +194,15 @@ module Scrapifier
|
|
146
194
|
# domain: (String)
|
147
195
|
# - Domain that will be prepended into the path.
|
148
196
|
def sf_fix_protocol(path, domain)
|
149
|
-
if path =~
|
197
|
+
if path =~ %r{^//[^/]+}
|
150
198
|
'http:' << path
|
151
199
|
else
|
152
|
-
|
153
|
-
end
|
200
|
+
"http://#{domain}#{'/' unless path =~ %r{^/[^/]+}}#{path}"
|
201
|
+
end
|
154
202
|
end
|
155
203
|
|
156
204
|
# Return the URI domain.
|
157
|
-
#
|
205
|
+
#
|
158
206
|
# Example:
|
159
207
|
# >> sf_domain('http://adtangerine.com')
|
160
208
|
# => 'adtangerine.com'
|
@@ -162,7 +210,8 @@ module Scrapifier
|
|
162
210
|
# uri: (String)
|
163
211
|
# - URI.
|
164
212
|
def sf_domain(uri)
|
165
|
-
|
213
|
+
uri = uri.to_s.split('/')
|
214
|
+
uri.empty? ? '' : uri[2]
|
166
215
|
end
|
167
216
|
end
|
168
|
-
end
|
217
|
+
end
|
data/lib/scrapifier/version.rb
CHANGED
data/spec/factories/samples.rb
CHANGED
@@ -1,40 +1,41 @@
|
|
1
1
|
module Factories
|
2
2
|
private
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
3
|
+
|
4
|
+
def sf_samples
|
5
|
+
{
|
6
|
+
misc: {
|
7
|
+
http: 'http://adtangerine.com',
|
8
|
+
https: 'https://rubygems.org/gems/string_awesome',
|
9
|
+
ftp: 'ftp://ftpserver.com',
|
10
|
+
www: 'www.twitflink.com'
|
11
|
+
},
|
12
|
+
images: {
|
13
|
+
jpg: [
|
14
|
+
'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
|
15
|
+
'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
|
16
|
+
'http://foobar.com.br/nice-image.jpg'
|
17
|
+
],
|
18
|
+
png: [
|
19
|
+
'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
|
20
|
+
'https://foobar.br/awesome_image.png',
|
21
|
+
'https://bar.foobar.br/foo/var/image.png?foo=bar',
|
22
|
+
],
|
23
|
+
gif: [
|
24
|
+
'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
|
25
|
+
'http://foobar.com/ugly_image.gif',
|
26
|
+
'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
|
27
|
+
]
|
28
|
+
},
|
29
|
+
regexes: {
|
30
|
+
image: {
|
31
|
+
all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
|
32
|
+
jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
|
33
|
+
png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
|
34
|
+
gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
|
10
35
|
},
|
11
|
-
|
12
|
-
|
13
|
-
'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
|
14
|
-
'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
|
15
|
-
'http://foobar.com.br/nice-image.jpg'
|
16
|
-
],
|
17
|
-
png: [
|
18
|
-
'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
|
19
|
-
'https://foobar.br/awesome_image.png',
|
20
|
-
'https://bar.foobar.br/foo/var/image.png?foo=bar',
|
21
|
-
],
|
22
|
-
gif: [
|
23
|
-
'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
|
24
|
-
'http://foobar.com/ugly_image.gif',
|
25
|
-
'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
|
26
|
-
]
|
27
|
-
},
|
28
|
-
regexes: {
|
29
|
-
image: {
|
30
|
-
all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
|
31
|
-
jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
|
32
|
-
png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
|
33
|
-
gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
|
34
|
-
},
|
35
|
-
uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
|
36
|
-
protocol: /((ht|f)tp[s]?)/i
|
37
|
-
}
|
36
|
+
uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
|
37
|
+
protocol: /((ht|f)tp[s]?)/i
|
38
38
|
}
|
39
|
-
|
40
|
-
end
|
39
|
+
}
|
40
|
+
end
|
41
|
+
end
|
data/spec/scrapifier_spec.rb
CHANGED
@@ -62,13 +62,16 @@ describe String do
|
|
62
62
|
end
|
63
63
|
|
64
64
|
it "includes a field with image URIs from the site's head/body" do
|
65
|
-
hash[:images].
|
66
|
-
|
65
|
+
unless hash[:images].empty?
|
66
|
+
hash[:images].is_a?(Array).should be_true
|
67
|
+
hash[:images].sample.should match(regexes[:image][:all])
|
68
|
+
end
|
67
69
|
end
|
68
70
|
end
|
69
71
|
|
70
72
|
it "includes a field with only the allowed types of image URIs from the site's head/body" do
|
71
|
-
misc[:http].scrapify(images: :png)[:images].sample
|
73
|
+
image = misc[:http].scrapify(images: :png)[:images].sample
|
74
|
+
image.should match(regexes[:image][:png]) unless image.nil?
|
72
75
|
end
|
73
76
|
|
74
77
|
it "can choose the URI in the String to be scrapified" do
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tiago Guedes
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-06-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|