scrapifier 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +83 -0
- data/Rakefile +5 -0
- data/lib/scrapifier/methods.rb +70 -0
- data/lib/scrapifier/support.rb +144 -0
- data/lib/scrapifier/version.rb +3 -0
- data/lib/scrapifier.rb +4 -0
- data/scrapifier.gemspec +26 -0
- data/spec/factories/uris.rb +40 -0
- data/spec/scrapifier_spec.rb +269 -0
- data/spec/spec_helper.rb +5 -0
- metadata +117 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: d645984640446b98bdc5bdf71972c991b361b75e
|
4
|
+
data.tar.gz: 9bd772bf8ab26ab4dda602fa69eee8c12ae45e39
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ff5dd829fd8e41af883fccd65ade03bb44d968f71a5457a0f2ceeb3afb0e389f71b44c0b559b352793293f4675cd9556c6e0bbab9799b637f1c4e6e7bdbb61ee
|
7
|
+
data.tar.gz: 3b974c372000f5d4f795f32074bd4da64e4fd910e8623f8446169c18b6880b1967aee105bf5df4e9de36fd054697c567727d4a8eee48def87f51b93b4117f556
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Tiago Guedes
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
# Scrapifier
|
2
|
+
|
3
|
+
It's a Ruby gem that brings a very simple way to extract meta information from URIs using the screen scraping technique.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Compatible with Ruby 1.9.3+
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
gem 'scrapifier'
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install scrapifier
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
The method finds a URI in the String and gets some meta information from it, like the page's title, description, images and the URI. All the data is returned in a well-formatted Hash.
|
24
|
+
|
25
|
+
#### Default usage.
|
26
|
+
|
27
|
+
``` ruby
|
28
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify
|
29
|
+
#=> {
|
30
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
31
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
32
|
+
# images: ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png", "http://s3-us-west-2.amazonaws.com/adtangerine-prod/users/avatars/000/000/834/thumb/275747_1118382211_1929809351_n.jpg", "http://adtangerine.com/assets/foobar.gif"],
|
33
|
+
# uri: "http://adtangerine.com"
|
34
|
+
# }
|
35
|
+
```
|
36
|
+
|
37
|
+
#### Allow only certain image types.
|
38
|
+
|
39
|
+
``` ruby
|
40
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: :jpg
|
41
|
+
#=> {
|
42
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
43
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
44
|
+
# images: ["http://s3-us-west-2.amazonaws.com/adtangerine-prod/users/avatars/000/000/834/thumb/275747_1118382211_1929809351_n.jpg"],
|
45
|
+
# uri: "http://adtangerine.com"
|
46
|
+
# }
|
47
|
+
|
48
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: [:png, :gif]
|
49
|
+
#=> {
|
50
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
51
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
52
|
+
# images: ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/foobar.gif"],
|
53
|
+
# uri: "http://adtangerine.com"
|
54
|
+
# }
|
55
|
+
```
|
56
|
+
|
57
|
+
#### Choose which URI you want to be scraped.
|
58
|
+
|
59
|
+
``` ruby
|
60
|
+
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify which: 1
|
61
|
+
#=> {
|
62
|
+
# title: "TwitFlink | Find a link!",
|
63
|
+
# description: "TwitFlink is a very simple searching tool that allows people to find out links tweeted by any user from Twitter.",
|
64
|
+
# images: ["http://www.twitflink.com//assets/tf_logo.png", "http://twitflink.com/assets/tf_logo.png"],
|
65
|
+
# uri: "http://www.twitflink.com"
|
66
|
+
# }
|
67
|
+
|
68
|
+
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify({ which: 0, images: :gif })
|
69
|
+
#=> {
|
70
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
71
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
72
|
+
# images: ["http://adtangerine.com/assets/foobar.gif"],
|
73
|
+
# uri: "http://adtangerine.com"
|
74
|
+
# }
|
75
|
+
```
|
76
|
+
|
77
|
+
## Contributing
|
78
|
+
|
79
|
+
1. Fork it
|
80
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
81
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
82
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
83
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/lib/scrapifier/methods.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
# coding: utf-8
require 'nokogiri'
require 'open-uri'
require 'scrapifier/support'

module Scrapifier
  # Instance methods mixed into String: the public screen-scraping API.
  module Methods
    include Scrapifier::Support

    # Gets meta data from a URI using the screen scraping technique.
    #
    # Example:
    #   >> 'Wow! What an awesome site: http://adtangerine.com!'.scrapify
    #   => {
    #        :title       => "AdTangerine | Advertising Platform for Social Media",
    #        :description => "AdTangerine is an advertising platform that uses the tangerine as a virtual currency...",
    #        :images      => ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png"],
    #        :uri         => "http://adtangerine.com"
    #      }
    #
    # Arguments:
    #   options: (Hash)
    #   - :which:  (Integer) which URI in the String will be used. It starts from 0 to N.
    #   - :images: (Symbol or Array) image extensions which are allowed in the result.
    #
    # Returns a Hash with :title, :description, :images and :uri keys, or an
    # empty Hash when no URI is found, the host is unreachable or the image
    # extension is not allowed.
    def scrapify(options = {})
      meta = {}
      uri  = find_uri(options[:which])

      begin
        raise if uri.nil?

        if uri =~ sf_regex(:image)
          # The URI points straight at an image: every field receives the
          # image URI itself, provided its extension is allowed.
          allowed = sf_check_img_ext(uri, options[:images])
          raise if allowed.empty?
          img = allowed[0]
          [:title, :description, :uri, :images].each { |key| meta[key] = img }
        else
          # Kernel#open is provided by open-uri here and fetches the page.
          doc = Nokogiri::HTML(open(uri).read)
          doc.encoding = 'utf-8'

          [:title, :description].each do |key|
            # Fall back to '-' when the XPath finds nothing.
            meta[key] = (doc.xpath(sf_paths[key])[0].text rescue '-')
          end

          meta[:images] = sf_fix_imgs(doc.xpath(sf_paths[:image]), uri, options[:images])
          meta[:uri] = uri
        end
      rescue
        # Any failure on the way (no URI, HTTP error, parse error) yields {}.
        meta = {}
      end

      meta
    end

    # Looks for URIs in the String.
    #
    # Example:
    #   >> 'Wow! What an awesome site: http://adtangerine.com!'.find_uri
    #   => 'http://adtangerine.com'
    #   >> 'Wow! What an awesome sites: http://adtangerine.com and www.twitflink.com'.find_uri 1
    #   => 'http://www.twitflink.com'
    #
    # Arguments:
    #   which: (Integer)
    #   - Which URI in the String: first (0), second (1) and so on.
    #
    # Returns the matched URI (with "http://" prepended when it has no
    # protocol), or nil when no URI is matched.
    def find_uri(which = 0)
      which ||= 0 # nil (e.g. options[:which] not given) means "the first one".
      uri = (scan(sf_regex(:uri))[which][0] rescue nil)
      return uri if uri.nil? || uri =~ sf_regex(:protocol)
      "http://#{uri}"
    end
  end
end
|
data/lib/scrapifier/support.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
module Scrapifier
  # Private helpers shared by Scrapifier::Methods: URI/image regexes,
  # XPath collections and URI normalization.
  module Support
    private

    # Filters images, returning only those with the allowed extensions.
    #
    # Example:
    #   >> sf_check_img_ext('http://source.com/image.gif', :jpg)
    #   => []
    #   >> sf_check_img_ext(['http://source.com/image.gif', 'http://source.com/image.jpg'], [:jpg, :png])
    #   => ['http://source.com/image.jpg']
    #
    # Arguments:
    #   images: (String or Array)
    #   - Images which will be checked.
    #   allowed: (String, Symbol or Array)
    #   - Allowed types of image extension. Empty/nil means "allow all".
    #
    # Always returns an Array.
    def sf_check_img_ext(images, allowed = [])
      allowed ||= []
      if images.is_a?(String)
        images = images.split
      elsif !images.is_a?(Array)
        images = []
      end
      images.select { |i| i =~ sf_regex(:image, allowed) }
    end

    # Selects regexes for URIs, protocols and image extensions.
    #
    # Example:
    #   >> sf_regex(:image, :jpg)
    #   => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i
    #
    # Arguments:
    #   type: (Symbol or String)
    #   - Regex type: :uri, :protocol or :image.
    #   args: (*)
    #   - For :image, the allowed extensions; ignored otherwise.
    def sf_regex(type, *args)
      type = type.to_sym unless type.is_a? Symbol
      if type == :image
        sf_img_regex args.flatten
      else
        regexes = {
          uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
          protocol: /((ht|f)tp[s]?)/i
        }
        regexes[type]
      end
    end

    # Builds image regexes according to the required extensions.
    #
    # Example:
    #   >> sf_img_regex
    #   => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i
    #   >> sf_img_regex([:jpg, :png])
    #   => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png)(\?.+)?$)/i
    #
    # Arguments:
    #   exts: (Array, Symbol or String)
    #   - Image extensions which will be included in the regex.
    def sf_img_regex(exts = [])
      exts = [exts].flatten unless exts.is_a?(Array)
      # Copy + drop nils so the caller's array is never mutated.
      exts = exts.compact
      if exts.empty?
        exts = %w(jpg jpeg png gif)
      elsif exts.include?(:jpg) && !exts.include?(:jpeg)
        # :jpg implies :jpeg as well.
        exts.push :jpeg
      end
      # Interpolated regex literal instead of eval: same pattern, no code execution.
      /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(#{exts.join('|')})(\?.+)?$)/i
    end

    # Collection of paths used to get content from HTML tags via Node#xpath method.
    # See more: http://nokogiri.org/tutorials/searching_a_xml_html_document.html
    #
    # Example:
    #   >> sf_paths[:title]
    #   => '//meta[@property = "og:title"]/@content | ... | //title | //h1'
    def sf_paths
      {
        title: '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1',
        description: '//meta[@property = "og:description"]/@content | //meta[@name = "description"]/@content | //meta[@name = "Description"]/@content | //h1 | //h3 | //p | //span | //font',
        image: '//meta[@property = "og:image"]/@content | //link[@rel = "image_src"]/@href | //meta[@itemprop = "image"]/@content | //div[@id = "logo"]/img/@src | //a[@id = "logo"]/img/@src | //div[@class = "logo"]/img/@src | //a[@class = "logo"]/img/@src | //a//img[@width]/@src | //img[@width]/@src | //a//img[@height]/@src | //img[@height]/@src | //a//img/@src | //span//img/@src'
      }
    end

    # Checks and returns only the valid image URIs.
    #
    # Example:
    #   >> sf_fix_imgs(['http://adtangerine.com/image.png', '/assets/image.jpg'], 'http://adtangerine.com', :jpg)
    #   => ['http://adtangerine.com/assets/image.jpg']
    #
    # Arguments:
    #   imgs: (Array)
    #   - Image URIs got from the HTML doc.
    #   uri: (String)
    #   - Used as basis for URIs that don't have any protocol/domain set.
    #   exts: (Symbol or Array)
    #   - Allowed image extensions.
    def sf_fix_imgs(imgs, uri, exts = [])
      sf_check_img_ext(imgs.map do |img|
        img = img.to_s
        img = sf_fix_protocol(img, sf_domain(uri)) unless img =~ sf_regex(:protocol)
        img if (img =~ sf_regex(:image))
      end.compact, exts)
    end

    # Fixes image URIs that don't have a protocol/domain.
    #
    # Example:
    #   >> sf_fix_protocol('/assets/image.jpg', 'adtangerine.com')
    #   => 'http://adtangerine.com/assets/image.jpg'
    #   >> sf_fix_protocol('//s.ytimg.com/yts/img/youtub_img.png', 'youtube.com')
    #   => 'http://s.ytimg.com/yts/img/youtub_img.png'
    #
    # Arguments:
    #   path: (String)
    #   - URI path having no protocol/domain set.
    #   domain: (String)
    #   - Domain that will be prepended onto the path.
    def sf_fix_protocol(path, domain)
      if path =~ /^\/\/[^\/]+/
        # Protocol-relative URI: just prepend the scheme.
        'http:' << path
      else
        "http://#{domain}#{'/' unless path =~ /^\/[^\/]+/}#{path}"
      end
    end

    # Returns the domain from a URI.
    #
    # Example:
    #   >> sf_domain('http://adtangerine.com')
    #   => 'adtangerine.com'
    #
    # Arguments:
    #   uri: (String)
    #
    # Returns '' when the URI cannot be split (e.g. nil).
    def sf_domain(uri)
      (uri.split('/')[2] rescue '')
    end
  end
end
|
data/lib/scrapifier.rb
ADDED
data/scrapifier.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
# Make lib/ loadable so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'scrapifier/version'

Gem::Specification.new do |spec|
  # Identity.
  spec.name     = 'scrapifier'
  spec.version  = Scrapifier::VERSION
  spec.license  = 'MIT'
  spec.homepage = 'https://github.com/tiagopog/scrapifier'

  # Authorship.
  spec.authors = ['Tiago Guedes']
  spec.email   = ['tiagopog@gmail.com']

  # What the gem does.
  spec.summary     = 'Extends the Ruby String class with a screen scraping method.'
  spec.description = 'A very simple way to extract meta information from URIs using the screen scraping technique.'

  # Packaged files come straight from the git index.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  # Runtime dependency.
  spec.add_runtime_dependency 'nokogiri', '~> 1.6'

  # Development dependencies.
  spec.add_development_dependency 'bundler', '~> 1.3'
  spec.add_development_dependency 'rspec', '~> 2.14'
  spec.add_development_dependency 'rake', '~> 10.1'
end
|
data/spec/factories/uris.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
# Fixture data shared by the specs: sample URIs, image URIs grouped by
# extension, and the regexes the library is expected to build.
module Factories
  private

  # Returns the fixture Hash with :misc, :images and :regexes keys.
  def sf_samples
    misc = {
      http: 'http://adtangerine.com',
      https: 'https://rubygems.org/gems/string_awesome',
      ftp: 'ftp://ftpserver.com',
      www: 'www.twitflink.com'
    }

    images = {
      jpg: [
        'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
        'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
        'http://foobar.com.br/nice-image.jpg'
      ],
      png: [
        'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
        'https://foobar.br/awesome_image.png',
        'https://bar.foobar.br/foo/var/image.png?foo=bar'
      ],
      gif: [
        'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
        'http://foobar.com/ugly_image.gif',
        'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
      ]
    }

    image_regexes = {
      all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
      jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
      png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
      gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
    }

    regexes = {
      image: image_regexes,
      uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
      protocol: /((ht|f)tp[s]?)/i
    }

    { misc: misc, images: images, regexes: regexes }
  end
end
|
data/spec/scrapifier_spec.rb
ADDED
@@ -0,0 +1,269 @@
|
|
1
|
+
# coding: utf-8
# Specs for the String extensions (scrapify/find_uri) and the private
# Support helpers (reached via #send). Network-touching examples hit the
# live sites referenced in the fixtures — they require connectivity.
require 'spec_helper'
include Factories

describe String do
  let(:images) { sf_samples[:images] }
  let(:misc) { sf_samples[:misc] }
  let(:regexes) { sf_samples[:regexes] }

  #
  # String#scrapify
  #

  describe '#scrapify' do
    context 'when no URI is matched in the String' do
      subject { 'String without any URI.'.scrapify }

      it { should eq({}) }
    end

    context 'when the website was not found' do
      subject { 'Check out this http://someweirduri.com.br'.scrapify }

      it { should eq({}) }
    end

    context 'when an image URI is matched' do
      let(:jpg) { images[:jpg][0] }
      let(:png) { images[:png][0] }
      let(:gif) { images[:gif][0] }

      it 'sets the same value for :title, :description and :uri keys' do
        "Say my name: #{jpg}".scrapify.should include(title: jpg, description: jpg, uri: jpg)
      end

      it 'allows all the standard image extensions by default (even GIFs)' do
        "Smile GIF! Oh, wait... #{gif}".scrapify.should include(title: gif, description: gif, uri: gif)
      end

      it 'returns an empty Hash if the extension is not allowed' do
        "PNG is awesome! #{png}".scrapify(images: [:jpg]).should eq({})
      end
    end

    context 'when a website URI is matched in the String and a Hash is returned' do
      subject(:hash) { "Look this awesome site #{misc[:http]}".scrapify }

      it "includes a field with the site's title" do
        hash[:title].is_a?(String).should be_true
        hash[:title].empty?.should be_false
      end

      it "includes a field with the site's description" do
        hash[:description].is_a?(String).should be_true
        hash[:description].empty?.should be_false
      end

      it 'includes a field with the page URI' do
        hash[:uri].is_a?(String).should be_true
        hash[:uri].empty?.should be_false
        hash[:uri].should eq(misc[:http])
      end

      it "includes a field with image URIs from the site's head/body" do
        hash[:images].is_a?(Array).should be_true
        hash[:images].sample.should match(regexes[:image][:all])
      end
    end

    it "includes a field with only the allowed types of image URIs from the site's head/body" do
      misc[:http].scrapify(images: :png)[:images].sample.should match(regexes[:image][:png])
    end

    it "can choose the URI in the String to be scrapified" do
      hash = "Check out these awesome sites: #{misc[:http]} and #{misc[:www]}".scrapify(which: 1, images: :png)
      [:title, :description, :uri].each do |key|
        hash[key].is_a?(String).should be_true
        hash[key].empty?.should be_false
      end
      hash[:uri].should eq("http://#{misc[:www]}")
      hash[:images].sample.should match(regexes[:image][:png])
    end
  end

  #
  # String#find_uri
  #

  describe '#find_uri' do
    # misc values in fixture order: http, https, ftp, www.
    let(:sample_uris) { misc.map { |u| u[1] } }
    let(:str) { "Awesome sites: #{sample_uris.join ' and '}" }

    it 'matches the first URI in the String by default' do
      str.send(:find_uri).should eq(sample_uris[0])
    end

    it 'matches the second URI in the String (https)' do
      str.send(:find_uri, 1).should eq(sample_uris[1])
    end

    # NOTE(review): description says "www", but sample_uris[2] is the ftp
    # sample given the fixture order above — confirm intent.
    it 'matches the third URI in the String (www)' do
      str.send(:find_uri, 2).should eq(sample_uris[2])
    end

    context 'when no URI is matched' do
      it 'returns nil' do
        'Lorem ipsum dolor.'.send(:find_uri).should be_nil
      end

      it 'returns nil (no presence of http|https|ftp|www)' do
        'Check this out: google.com'.send(:find_uri).should be_nil
      end
    end
  end

  #
  # String#sf_check_img_ext
  #

  describe '#sf_check_img_ext' do
    let(:img) { images[:jpg].sample }
    let(:imgs) { images.map { |i| i[1] }.flatten }
    # One call per argument style; private method reached via #send.
    let(:checked) do
      {
        str: ''.send(:sf_check_img_ext, img),
        array: ''.send(:sf_check_img_ext, imgs),
        jpg: ''.send(:sf_check_img_ext, imgs, [:jpg]),
        png: ''.send(:sf_check_img_ext, imgs, :png),
        gif: ''.send(:sf_check_img_ext, imgs, 'gif')
      }
    end

    context 'when no arument is passed' do
      it { expect { ''.send(:sf_check_img_ext) }.to raise_error(ArgumentError) }
    end

    context 'when only the first argument is defined' do
      it 'allows a String as argument' do
        checked[:str].should have(1).item
      end

      it 'allows an Array as argument' do
        checked[:jpg].should have(3).item
      end

      it 'allows all the image extensions by default' do
        checked[:array].should have(9).item
      end
    end

    context 'when the two arguments are defined' do
      it 'allows a Symbol as the second argument' do
        checked[:png].should have(3).item
      end

      it 'allows a String as the second argument' do
        checked[:gif].should have(3).item
      end

      it 'allows an Array as the second argument' do
        checked[:jpg].should have(3).item
      end

      it 'returns an Array with only image types allowed' do
        [:jpg, :png, :gif].each { |ext| checked[ext].should have(3).item }
      end
    end

    # NOTE(review): this example has no body — it is effectively a no-op
    # placeholder, not a pending spec.
    context 'when no image is found/allowed' do
      it 'returns an empty Array' do
      end
    end

    it 'always returns an Array' do
      checked.each { |c| c[1].is_a?(Array).should be_true }
    end
  end


  #
  # String#sf_regex
  #

  describe '#sf_regex' do
    context 'when it needs a regex to match any kind of URI' do
      subject { ''.send(:sf_regex, :uri) }

      # NOTE(review): the loop variable `p` is unused — every iteration
      # asserts against misc[:http]; likely meant match(misc[p]).
      [:http, :https, :ftp, :www].each do |p|
        it { should match(misc[:http]) }
      end
    end

    context 'when it needs a regex to match only image uris' do
      subject { ''.send(:sf_regex, :image) }

      [:jpg, :png, :gif].each do |ext|
        it { should match(sf_samples[:images][ext].sample) }
      end
    end
  end

  #
  # String#sf_img_regex
  #

  describe '#sf_img_regex' do
    let(:img_regexes) { regexes[:image] }

    context 'when no argument is passed' do
      subject(:regex) { ''.send(:sf_img_regex) }

      it 'returns a regex that matches all image extensions' do
        regex.should eq(img_regexes[:all])
      end

      it 'matches all image extensions' do
        [:jpg, :png, :gif].each { |ext| images[ext].sample.should match(regex) }
      end
    end

    context 'when only jpg is allowed' do
      subject(:regex) { ''.send(:sf_img_regex, [:jpg]) }

      it 'returns a regex that matches only jpg images' do
        regex.should eq(img_regexes[:jpg])
      end

      it 'matches only the defined extension' do
        regex.should match(images[:jpg].sample)
      end

      it "doesn't match any other extension" do
        [:png, :gif].each { |ext| regex.should_not match(images[ext].sample) }
      end
    end

    context 'when only png is allowed' do
      subject(:regex) { ''.send(:sf_img_regex, :png) }

      it 'returns a regex that matches only png images' do
        regex.should eq(img_regexes[:png])
      end

      it 'matches only the defined extension' do
        regex.should match(images[:png].sample)
      end

      it "doesn't match any other extension" do
        [:jpg, :gif].each { |ext| regex.should_not match(images[ext].sample) }
      end
    end

    context 'when only gif (argh!) is allowed' do
      subject(:regex) { ''.send(:sf_img_regex, :gif) }

      it 'returns a regex that matches only gif images' do
        regex.should eq(img_regexes[:gif])
      end

      it 'matches only the defined extension' do
        regex.should match(images[:gif].sample)
      end

      it "doesn't match any other extension" do
        [:jpg, :png].each { |ext| regex.should_not match(images[ext].sample) }
      end
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scrapifier
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Tiago Guedes
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-04-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.3'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.3'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '2.14'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '2.14'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10.1'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10.1'
|
69
|
+
description: A very simple way to extract meta information from URIs using the screen
|
70
|
+
scraping technique.
|
71
|
+
email:
|
72
|
+
- tiagopog@gmail.com
|
73
|
+
executables: []
|
74
|
+
extensions: []
|
75
|
+
extra_rdoc_files: []
|
76
|
+
files:
|
77
|
+
- ".gitignore"
|
78
|
+
- Gemfile
|
79
|
+
- LICENSE.txt
|
80
|
+
- README.md
|
81
|
+
- Rakefile
|
82
|
+
- lib/scrapifier.rb
|
83
|
+
- lib/scrapifier/methods.rb
|
84
|
+
- lib/scrapifier/support.rb
|
85
|
+
- lib/scrapifier/version.rb
|
86
|
+
- scrapifier.gemspec
|
87
|
+
- spec/factories/uris.rb
|
88
|
+
- spec/scrapifier_spec.rb
|
89
|
+
- spec/spec_helper.rb
|
90
|
+
homepage: https://github.com/tiagopog/scrapifier
|
91
|
+
licenses:
|
92
|
+
- MIT
|
93
|
+
metadata: {}
|
94
|
+
post_install_message:
|
95
|
+
rdoc_options: []
|
96
|
+
require_paths:
|
97
|
+
- lib
|
98
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0'
|
103
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
104
|
+
requirements:
|
105
|
+
- - ">="
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: '0'
|
108
|
+
requirements: []
|
109
|
+
rubyforge_project:
|
110
|
+
rubygems_version: 2.2.2
|
111
|
+
signing_key:
|
112
|
+
specification_version: 4
|
113
|
+
summary: Extends the Ruby String class with a screen scraping method.
|
114
|
+
test_files:
|
115
|
+
- spec/factories/uris.rb
|
116
|
+
- spec/scrapifier_spec.rb
|
117
|
+
- spec/spec_helper.rb
|