scrapifier 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +83 -0
- data/Rakefile +5 -0
- data/lib/scrapifier/methods.rb +70 -0
- data/lib/scrapifier/support.rb +144 -0
- data/lib/scrapifier/version.rb +3 -0
- data/lib/scrapifier.rb +4 -0
- data/scrapifier.gemspec +26 -0
- data/spec/factories/uris.rb +40 -0
- data/spec/scrapifier_spec.rb +269 -0
- data/spec/spec_helper.rb +5 -0
- metadata +117 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: d645984640446b98bdc5bdf71972c991b361b75e
|
4
|
+
data.tar.gz: 9bd772bf8ab26ab4dda602fa69eee8c12ae45e39
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ff5dd829fd8e41af883fccd65ade03bb44d968f71a5457a0f2ceeb3afb0e389f71b44c0b559b352793293f4675cd9556c6e0bbab9799b637f1c4e6e7bdbb61ee
|
7
|
+
data.tar.gz: 3b974c372000f5d4f795f32074bd4da64e4fd910e8623f8446169c18b6880b1967aee105bf5df4e9de36fd054697c567727d4a8eee48def87f51b93b4117f556
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Tiago Guedes
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
# Scrapifier
|
2
|
+
|
3
|
+
It's a Ruby gem that brings a very simple way to extract meta information from URIs using the screen scraping technique.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Compatible with Ruby 1.9.3+
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
gem 'scrapifier'
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install scrapifier
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
The method finds a URI in the String and gets some meta information from it, like the page's title, description, images and the URI. All the data is returned in a well-formatted Hash.
|
24
|
+
|
25
|
+
#### Default usage.
|
26
|
+
|
27
|
+
``` ruby
|
28
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify
|
29
|
+
#=> {
|
30
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
31
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
32
|
+
# images: ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png", "http://s3-us-west-2.amazonaws.com/adtangerine-prod/users/avatars/000/000/834/thumb/275747_1118382211_1929809351_n.jpg", "http://adtangerine.com/assets/foobar.gif"],
|
33
|
+
# uri: "http://adtangerine.com"
|
34
|
+
# }
|
35
|
+
```
|
36
|
+
|
37
|
+
#### Allow only certain image types.
|
38
|
+
|
39
|
+
``` ruby
|
40
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: :jpg
|
41
|
+
#=> {
|
42
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
43
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
44
|
+
# images: ["http://s3-us-west-2.amazonaws.com/adtangerine-prod/users/avatars/000/000/834/thumb/275747_1118382211_1929809351_n.jpg"],
|
45
|
+
# uri: "http://adtangerine.com"
|
46
|
+
# }
|
47
|
+
|
48
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: [:png, :gif]
|
49
|
+
#=> {
|
50
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
51
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
52
|
+
# images: ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/foobar.gif"],
|
53
|
+
# uri: "http://adtangerine.com"
|
54
|
+
# }
|
55
|
+
```
|
56
|
+
|
57
|
+
#### Choose which URI you want it to be scraped.
|
58
|
+
|
59
|
+
``` ruby
|
60
|
+
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify which: 1
|
61
|
+
#=> {
|
62
|
+
# title: "TwitFlink | Find a link!",
|
63
|
+
# description: "TwitFlink is a very simple searching tool that allows people to find out links tweeted by any user from Twitter.",
|
64
|
+
# images: ["http://www.twitflink.com//assets/tf_logo.png", "http://twitflink.com/assets/tf_logo.png"],
|
65
|
+
# uri: "http://www.twitflink.com"
|
66
|
+
# }
|
67
|
+
|
68
|
+
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify({ which: 0, images: :gif })
|
69
|
+
#=> {
|
70
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
71
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
72
|
+
# images: ["http://adtangerine.com/assets/foobar.gif"],
|
73
|
+
# uri: "http://adtangerine.com"
|
74
|
+
# }
|
75
|
+
```
|
76
|
+
|
77
|
+
## Contributing
|
78
|
+
|
79
|
+
1. Fork it
|
80
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
81
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
82
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
83
|
+
5. Create new Pull Request
|
data/lib/scrapifier/methods.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'scrapifier/support'
|
5
|
+
|
6
|
+
module Scrapifier
|
7
|
+
module Methods
|
8
|
+
include Scrapifier::Support
|
9
|
+
|
10
|
+
# Gets meta data from an URI using the screen scraping technique.
|
11
|
+
#
|
12
|
+
# Example:
|
13
|
+
# >> 'Wow! What an awesome site: http://adtangerine.com!'.scrapify
|
14
|
+
# => {
|
15
|
+
# :title => "AdTangerine | Advertising Platform for Social Media",
|
16
|
+
# :description => "AdTangerine is an advertising platform that uses the tangerine as a virtual currency...",
|
17
|
+
# :images => ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png"],
|
18
|
+
# :uri => "http://adtangerine.com"
|
19
|
+
# }
|
20
|
+
# Arguments:
|
21
|
+
# options: (Hash)
|
22
|
+
# - which: (Integer) Indicates which URI in the String will be used. It starts from 0 to N.
|
23
|
+
# - images: (Symbol or Array) Indicates the image extensions which are allowed to be returned as result.
|
24
|
+
|
25
|
+
def scrapify(options = {})
|
26
|
+
meta, uri = {}, find_uri(options[:which])
|
27
|
+
|
28
|
+
begin
|
29
|
+
if uri.nil?
|
30
|
+
raise
|
31
|
+
elsif uri =~ sf_regex(:image)
|
32
|
+
uri = (sf_check_img_ext(uri, options[:images])[0] rescue [])
|
33
|
+
raise if uri.empty?
|
34
|
+
[:title, :description, :uri, :images].each { |key| meta[key] = uri }
|
35
|
+
else
|
36
|
+
doc = Nokogiri::HTML(open(uri).read)
|
37
|
+
doc.encoding = 'utf-8'
|
38
|
+
|
39
|
+
[:title, :description].each do |key|
|
40
|
+
meta[key] = (doc.xpath(sf_paths[key])[0].text rescue '-')
|
41
|
+
end
|
42
|
+
|
43
|
+
meta[:images] = sf_fix_imgs(doc.xpath(sf_paths[:image]), uri, options[:images])
|
44
|
+
meta[:uri] = uri
|
45
|
+
end
|
46
|
+
rescue
|
47
|
+
meta = {}
|
48
|
+
end
|
49
|
+
|
50
|
+
meta
|
51
|
+
end
|
52
|
+
|
53
|
+
# Looks for URIs in the String.
|
54
|
+
#
|
55
|
+
# Example:
|
56
|
+
# >> 'Wow! What an awesome site: http://adtangerine.com!'.find_uri
|
57
|
+
# => 'http://adtangerine.com'
|
58
|
+
# >> 'Wow! What an awesome sites: http://adtangerine.com and www.twitflink.com'.find_uri 1
|
59
|
+
# => 'www.twitflink.com'
|
60
|
+
# Arguments:
|
61
|
+
# which: (Integer)
|
62
|
+
# - Which URI in the String: first (0), second (1) and so on.
|
63
|
+
|
64
|
+
def find_uri(which = 0)
|
65
|
+
which ||= which.to_i
|
66
|
+
which = self.scan(sf_regex(:uri))[which][0] rescue nil
|
67
|
+
(which.nil? or which =~ sf_regex(:protocol)) ? which : 'http://' << which
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
module Scrapifier
|
2
|
+
module Support
|
3
|
+
private
|
4
|
+
# Filters images returning those with the allowed extensions.
|
5
|
+
#
|
6
|
+
# Example:
|
7
|
+
# >> sf_check_img_ext('http://source.com/image.gif', :jpg)
|
8
|
+
# => []
|
9
|
+
# >> sf_check_img_ext(['http://source.com/image.gif', 'http://source.com/image.jpg'], [:jpg, :png])
|
10
|
+
# => ['http://source.com/image.jpg']
|
11
|
+
# Arguments:
|
12
|
+
# images: (String or Array)
|
13
|
+
# - Images which will be checked.
|
14
|
+
# allowed: (String, Symbol or Array)
|
15
|
+
# - Allowed types of image extension.
|
16
|
+
|
17
|
+
def sf_check_img_ext(images, allowed = [])
|
18
|
+
allowed ||= []
|
19
|
+
if images.is_a?(String)
|
20
|
+
images = images.split
|
21
|
+
elsif !images.is_a?(Array)
|
22
|
+
images = []
|
23
|
+
end
|
24
|
+
images.select { |i| i =~ sf_regex(:image, allowed) }
|
25
|
+
end
|
26
|
+
|
27
|
+
# Selects regexes for URIs, protocols and image extensions.
|
28
|
+
#
|
29
|
+
# Example:
|
30
|
+
# >> sf_regex(:uri)
|
31
|
+
# => /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
|
32
|
+
# >> sf_regex(:image, :jpg)
|
33
|
+
# => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg)(\?.+)?$)/i
|
34
|
+
# Arguments:
|
35
|
+
# type: (Symbol or String)
|
36
|
+
# - Regex type.
|
37
|
+
# args: (*)
|
38
|
+
# - Anything.
|
39
|
+
|
40
|
+
def sf_regex(type, *args)
|
41
|
+
type = type.to_sym unless type.is_a? Symbol
|
42
|
+
if type == :image
|
43
|
+
sf_img_regex args.flatten
|
44
|
+
else
|
45
|
+
regexes = {
|
46
|
+
uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
|
47
|
+
protocol: /((ht|f)tp[s]?)/i
|
48
|
+
}
|
49
|
+
regexes[type]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
# Builds image regexes according to the required extensions.
|
54
|
+
#
|
55
|
+
# Example:
|
56
|
+
# >> sf_img_regex
|
57
|
+
# => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i
|
58
|
+
# >> sf_img_regex([:jpg, :png])
|
59
|
+
# => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|png)(\?.+)?$)/i
|
60
|
+
# Arguments:
|
61
|
+
# exts: (Array)
|
62
|
+
# - Image extensions which will be included in the regex.
|
63
|
+
|
64
|
+
def sf_img_regex(exts = [])
|
65
|
+
exts = [exts].flatten unless exts.is_a?(Array)
|
66
|
+
if exts.nil? or exts.empty?
|
67
|
+
exts = %w(jpg jpeg png gif)
|
68
|
+
elsif exts.include?(:jpg) and !exts.include?(:jpeg)
|
69
|
+
exts.push :jpeg
|
70
|
+
end
|
71
|
+
eval "/(^http{1}[s]?:\\/\\/([w]{3}\\.)?.+\\.(#{exts.join('|')})(\\?.+)?$)/i"
|
72
|
+
end
|
73
|
+
|
74
|
+
# Collection of paths used to get content from HTML tags via Node#xpath method.
|
75
|
+
# See more: http://nokogiri.org/tutorials/searching_a_xml_html_document.html
|
76
|
+
#
|
77
|
+
# Example:
|
78
|
+
# >> sf_paths[:title]
|
79
|
+
# => '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1'
|
80
|
+
|
81
|
+
def sf_paths
|
82
|
+
{
|
83
|
+
title: '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1',
|
84
|
+
description: '//meta[@property = "og:description"]/@content | //meta[@name = "description"]/@content | //meta[@name = "Description"]/@content | //h1 | //h3 | //p | //span | //font',
|
85
|
+
image: '//meta[@property = "og:image"]/@content | //link[@rel = "image_src"]/@href | //meta[@itemprop = "image"]/@content | //div[@id = "logo"]/img/@src | //a[@id = "logo"]/img/@src | //div[@class = "logo"]/img/@src | //a[@class = "logo"]/img/@src | //a//img[@width]/@src | //img[@width]/@src | //a//img[@height]/@src | //img[@height]/@src | //a//img/@src | //span//img/@src'
|
86
|
+
}
|
87
|
+
end
|
88
|
+
|
89
|
+
# Checks and returns only the valid image URIs.
|
90
|
+
#
|
91
|
+
# Example:
|
92
|
+
# >> sf_fix_imgs(['http://adtangerine.com/image.png', '/assets/image.jpg'], 'http://adtangerine.com', :jpg)
|
93
|
+
# => ['http://adtangerine/assets/image.jpg']
|
94
|
+
# Arguments:
|
95
|
+
# imgs: (Array)
|
96
|
+
# - Image URIs got from the HTML doc.
|
97
|
+
# uri: (String)
|
98
|
+
# - Used as basis to the URIs that don't have any protocol/domain set.
|
99
|
+
# exts: (Symbol or Array)
|
100
|
+
# - Allowed image extensions.
|
101
|
+
|
102
|
+
def sf_fix_imgs(imgs, uri, exts = [])
|
103
|
+
sf_check_img_ext(imgs.map do |img|
|
104
|
+
img = img.to_s
|
105
|
+
img = sf_fix_protocol(img, sf_domain(uri)) unless img =~ sf_regex(:protocol)
|
106
|
+
img if (img =~ sf_regex(:image))
|
107
|
+
end.compact, exts)
|
108
|
+
end
|
109
|
+
|
110
|
+
# Fixes image URIs that don't have a protocol/domain.
|
111
|
+
#
|
112
|
+
# Example:
|
113
|
+
# >> sf_fix_protocol('/assets/image.jpg', 'http://adtangerine.com')
|
114
|
+
# => 'http://adtangerine/assets/image.jpg'
|
115
|
+
# >> sf_fix_protocol('//s.ytimg.com/yts/img/youtub_img.png', 'https://youtube.com')
|
116
|
+
# => 'https://s.ytimg.com/yts/img/youtub_img.png'
|
117
|
+
# Arguments:
|
118
|
+
# path: (String)
|
119
|
+
# - URI path having no protocol/domain set.
|
120
|
+
# domain: (String)
|
121
|
+
# - Domain that will be prepended into the path.
|
122
|
+
|
123
|
+
def sf_fix_protocol(path, domain)
|
124
|
+
if path =~ /^\/\/[^\/]+/
|
125
|
+
'http:' << path
|
126
|
+
else
|
127
|
+
"http://#{domain}#{'/' unless path =~ /^\/[^\/]+/}#{path}"
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
# Returns the domain from an URI
|
132
|
+
#
|
133
|
+
# Example:
|
134
|
+
# >> sf_domain('http://adtangerine.com')
|
135
|
+
# => 'adtangerine.com'
|
136
|
+
# Arguments:
|
137
|
+
# uri: (String)
|
138
|
+
# - URI.
|
139
|
+
|
140
|
+
def sf_domain(uri)
|
141
|
+
(uri.split('/')[2] rescue '')
|
142
|
+
end
|
143
|
+
end
|
144
|
+
end
|
data/lib/scrapifier.rb
ADDED
data/scrapifier.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'scrapifier/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = 'scrapifier'
|
8
|
+
spec.version = Scrapifier::VERSION
|
9
|
+
spec.authors = ['Tiago Guedes']
|
10
|
+
spec.email = ['tiagopog@gmail.com']
|
11
|
+
spec.description = 'A very simple way to extract meta information from URIs using the screen scraping technique.'
|
12
|
+
spec.summary = 'Extends the Ruby String class with a screen scraping method.'
|
13
|
+
spec.homepage = 'https://github.com/tiagopog/scrapifier'
|
14
|
+
spec.license = 'MIT'
|
15
|
+
|
16
|
+
spec.files = `git ls-files`.split($/)
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
+
spec.require_paths = ['lib']
|
20
|
+
|
21
|
+
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
22
|
+
|
23
|
+
spec.add_development_dependency 'bundler', '~> 1.3'
|
24
|
+
spec.add_development_dependency 'rspec', '~> 2.14'
|
25
|
+
spec.add_development_dependency 'rake', '~> 10.1'
|
26
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Factories
|
2
|
+
private
|
3
|
+
def sf_samples
|
4
|
+
{
|
5
|
+
misc: {
|
6
|
+
http: 'http://adtangerine.com',
|
7
|
+
https: 'https://rubygems.org/gems/string_awesome',
|
8
|
+
ftp: 'ftp://ftpserver.com',
|
9
|
+
www: 'www.twitflink.com'
|
10
|
+
},
|
11
|
+
images: {
|
12
|
+
jpg: [
|
13
|
+
'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
|
14
|
+
'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
|
15
|
+
'http://foobar.com.br/nice-image.jpg'
|
16
|
+
],
|
17
|
+
png: [
|
18
|
+
'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
|
19
|
+
'https://foobar.br/awesome_image.png',
|
20
|
+
'https://bar.foobar.br/foo/var/image.png?foo=bar',
|
21
|
+
],
|
22
|
+
gif: [
|
23
|
+
'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
|
24
|
+
'http://foobar.com/ugly_image.gif',
|
25
|
+
'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
|
26
|
+
]
|
27
|
+
},
|
28
|
+
regexes: {
|
29
|
+
image: {
|
30
|
+
all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
|
31
|
+
jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
|
32
|
+
png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
|
33
|
+
gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
|
34
|
+
},
|
35
|
+
uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
|
36
|
+
protocol: /((ht|f)tp[s]?)/i
|
37
|
+
}
|
38
|
+
}
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,269 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'spec_helper'
|
3
|
+
include Factories
|
4
|
+
|
5
|
+
describe String do
|
6
|
+
let(:images) { sf_samples[:images] }
|
7
|
+
let(:misc) { sf_samples[:misc] }
|
8
|
+
let(:regexes) { sf_samples[:regexes] }
|
9
|
+
|
10
|
+
#
|
11
|
+
# String#scrapify
|
12
|
+
#
|
13
|
+
|
14
|
+
describe '#scrapify' do
|
15
|
+
context 'when no URI is matched in the String' do
|
16
|
+
subject { 'String without any URI.'.scrapify }
|
17
|
+
|
18
|
+
it { should eq({}) }
|
19
|
+
end
|
20
|
+
|
21
|
+
context 'when the website was not found' do
|
22
|
+
subject { 'Check out this http://someweirduri.com.br'.scrapify }
|
23
|
+
|
24
|
+
it { should eq({}) }
|
25
|
+
end
|
26
|
+
|
27
|
+
context 'when an image URI is matched' do
|
28
|
+
let(:jpg) { images[:jpg][0] }
|
29
|
+
let(:png) { images[:png][0] }
|
30
|
+
let(:gif) { images[:gif][0] }
|
31
|
+
|
32
|
+
it 'sets the same value for :title, :description and :uri keys' do
|
33
|
+
"Say my name: #{jpg}".scrapify.should include(title: jpg, description: jpg, uri: jpg)
|
34
|
+
end
|
35
|
+
|
36
|
+
it 'allows all the standard image extensions by default (even GIFs)' do
|
37
|
+
"Smile GIF! Oh, wait... #{gif}".scrapify.should include(title: gif, description: gif, uri: gif)
|
38
|
+
end
|
39
|
+
|
40
|
+
it 'returns an empty Hash if the extension is not allowed' do
|
41
|
+
"PNG is awesome! #{png}".scrapify(images: [:jpg]).should eq({})
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
context 'when a website URI is matched in the String and a Hash is returned' do
|
46
|
+
subject(:hash) { "Look this awesome site #{misc[:http]}".scrapify }
|
47
|
+
|
48
|
+
it "includes a field with the site's title" do
|
49
|
+
hash[:title].is_a?(String).should be_true
|
50
|
+
hash[:title].empty?.should be_false
|
51
|
+
end
|
52
|
+
|
53
|
+
it "includes a field with the site's description" do
|
54
|
+
hash[:description].is_a?(String).should be_true
|
55
|
+
hash[:description].empty?.should be_false
|
56
|
+
end
|
57
|
+
|
58
|
+
it 'includes a field with the page URI' do
|
59
|
+
hash[:uri].is_a?(String).should be_true
|
60
|
+
hash[:uri].empty?.should be_false
|
61
|
+
hash[:uri].should eq(misc[:http])
|
62
|
+
end
|
63
|
+
|
64
|
+
it "includes a field with image URIs from the site's head/body" do
|
65
|
+
hash[:images].is_a?(Array).should be_true
|
66
|
+
hash[:images].sample.should match(regexes[:image][:all])
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
it "includes a field with only the allowed types of image URIs from the site's head/body" do
|
71
|
+
misc[:http].scrapify(images: :png)[:images].sample.should match(regexes[:image][:png])
|
72
|
+
end
|
73
|
+
|
74
|
+
it "can choose the URI in the String to be scrapified" do
|
75
|
+
hash = "Check out these awesome sites: #{misc[:http]} and #{misc[:www]}".scrapify(which: 1, images: :png)
|
76
|
+
[:title, :description, :uri].each do |key|
|
77
|
+
hash[key].is_a?(String).should be_true
|
78
|
+
hash[key].empty?.should be_false
|
79
|
+
end
|
80
|
+
hash[:uri].should eq("http://#{misc[:www]}")
|
81
|
+
hash[:images].sample.should match(regexes[:image][:png])
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
#
|
86
|
+
# String#find_uri
|
87
|
+
#
|
88
|
+
|
89
|
+
describe '#find_uri' do
|
90
|
+
let(:sample_uris) { misc.map { |u| u[1] } }
|
91
|
+
let(:str) { "Awesome sites: #{sample_uris.join ' and '}" }
|
92
|
+
|
93
|
+
it 'matches the first URI in the String by default' do
|
94
|
+
str.send(:find_uri).should eq(sample_uris[0])
|
95
|
+
end
|
96
|
+
|
97
|
+
it 'matches the second URI in the String (https)' do
|
98
|
+
str.send(:find_uri, 1).should eq(sample_uris[1])
|
99
|
+
end
|
100
|
+
|
101
|
+
it 'matches the third URI in the String (www)' do
|
102
|
+
str.send(:find_uri, 2).should eq(sample_uris[2])
|
103
|
+
end
|
104
|
+
|
105
|
+
context 'when no URI is matched' do
|
106
|
+
it 'returns nil' do
|
107
|
+
'Lorem ipsum dolor.'.send(:find_uri).should be_nil
|
108
|
+
end
|
109
|
+
|
110
|
+
it 'returns nil (no presence of http|https|ftp|www)' do
|
111
|
+
'Check this out: google.com'.send(:find_uri).should be_nil
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
#
|
117
|
+
# String#sf_check_img_ext
|
118
|
+
#
|
119
|
+
|
120
|
+
describe '#sf_check_img_ext' do
|
121
|
+
let(:img) { images[:jpg].sample }
|
122
|
+
let(:imgs) { images.map { |i| i[1] }.flatten }
|
123
|
+
let(:checked) do
|
124
|
+
{
|
125
|
+
str: ''.send(:sf_check_img_ext, img),
|
126
|
+
array: ''.send(:sf_check_img_ext, imgs),
|
127
|
+
jpg: ''.send(:sf_check_img_ext, imgs, [:jpg]),
|
128
|
+
png: ''.send(:sf_check_img_ext, imgs, :png),
|
129
|
+
gif: ''.send(:sf_check_img_ext, imgs, 'gif')
|
130
|
+
}
|
131
|
+
end
|
132
|
+
|
133
|
+
context 'when no argument is passed' do
|
134
|
+
it { expect { ''.send(:sf_check_img_ext) }.to raise_error(ArgumentError) }
|
135
|
+
end
|
136
|
+
|
137
|
+
context 'when only the first argument is defined' do
|
138
|
+
it 'allows a String as argument' do
|
139
|
+
checked[:str].should have(1).item
|
140
|
+
end
|
141
|
+
|
142
|
+
it 'allows an Array as argument' do
|
143
|
+
checked[:jpg].should have(3).item
|
144
|
+
end
|
145
|
+
|
146
|
+
it 'allows all the image extensions by default' do
|
147
|
+
checked[:array].should have(9).item
|
148
|
+
end
|
149
|
+
end
|
150
|
+
|
151
|
+
context 'when the two arguments are defined' do
|
152
|
+
it 'allows a Symbol as the second argument' do
|
153
|
+
checked[:png].should have(3).item
|
154
|
+
end
|
155
|
+
|
156
|
+
it 'allows a String as the second argument' do
|
157
|
+
checked[:gif].should have(3).item
|
158
|
+
end
|
159
|
+
|
160
|
+
it 'allows an Array as the second argument' do
|
161
|
+
checked[:jpg].should have(3).item
|
162
|
+
end
|
163
|
+
|
164
|
+
it 'returns an Array with only image types allowed' do
|
165
|
+
[:jpg, :png, :gif].each { |ext| checked[ext].should have(3).item }
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
context 'when no image is found/allowed' do
|
170
|
+
it 'returns an empty Array' do
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
it 'always returns an Array' do
|
175
|
+
checked.each { |c| c[1].is_a?(Array).should be_true }
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
179
|
+
|
180
|
+
#
|
181
|
+
# String#sf_regex
|
182
|
+
#
|
183
|
+
|
184
|
+
describe '#sf_regex' do
|
185
|
+
context 'when it needs a regex to match any kind of URI' do
|
186
|
+
subject { ''.send(:sf_regex, :uri) }
|
187
|
+
|
188
|
+
[:http, :https, :ftp, :www].each do |p|
|
189
|
+
it { should match(misc[:http]) }
|
190
|
+
end
|
191
|
+
end
|
192
|
+
|
193
|
+
context 'when it needs a regex to match only image uris' do
|
194
|
+
subject { ''.send(:sf_regex, :image) }
|
195
|
+
|
196
|
+
[:jpg, :png, :gif].each do |ext|
|
197
|
+
it { should match(sf_samples[:images][ext].sample) }
|
198
|
+
end
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
202
|
+
#
|
203
|
+
# String#sf_img_regex
|
204
|
+
#
|
205
|
+
|
206
|
+
describe '#sf_img_regex' do
|
207
|
+
let(:img_regexes) { regexes[:image] }
|
208
|
+
|
209
|
+
context 'when no argument is passed' do
|
210
|
+
subject(:regex) { ''.send(:sf_img_regex) }
|
211
|
+
|
212
|
+
it 'returns a regex that matches all image extensions' do
|
213
|
+
regex.should eq(img_regexes[:all])
|
214
|
+
end
|
215
|
+
|
216
|
+
it 'matches all image extensions' do
|
217
|
+
[:jpg, :png, :gif].each { |ext| images[ext].sample.should match(regex) }
|
218
|
+
end
|
219
|
+
end
|
220
|
+
|
221
|
+
context 'when only jpg is allowed' do
|
222
|
+
subject(:regex) { ''.send(:sf_img_regex, [:jpg]) }
|
223
|
+
|
224
|
+
it 'returns a regex that matches only jpg images' do
|
225
|
+
regex.should eq(img_regexes[:jpg])
|
226
|
+
end
|
227
|
+
|
228
|
+
it 'matches only the defined extension' do
|
229
|
+
regex.should match(images[:jpg].sample)
|
230
|
+
end
|
231
|
+
|
232
|
+
it "doesn't match any other extension" do
|
233
|
+
[:png, :gif].each { |ext| regex.should_not match(images[ext].sample) }
|
234
|
+
end
|
235
|
+
end
|
236
|
+
|
237
|
+
context 'when only png is allowed' do
|
238
|
+
subject(:regex) { ''.send(:sf_img_regex, :png) }
|
239
|
+
|
240
|
+
it 'returns a regex that matches only png images' do
|
241
|
+
regex.should eq(img_regexes[:png])
|
242
|
+
end
|
243
|
+
|
244
|
+
it 'matches only the defined extension' do
|
245
|
+
regex.should match(images[:png].sample)
|
246
|
+
end
|
247
|
+
|
248
|
+
it "doesn't match any other extension" do
|
249
|
+
[:jpg, :gif].each { |ext| regex.should_not match(images[ext].sample) }
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
253
|
+
context 'when only gif (argh!) is allowed' do
|
254
|
+
subject(:regex) { ''.send(:sf_img_regex, :gif) }
|
255
|
+
|
256
|
+
it 'returns a regex that matches only gif images' do
|
257
|
+
regex.should eq(img_regexes[:gif])
|
258
|
+
end
|
259
|
+
|
260
|
+
it 'matches only the defined extension' do
|
261
|
+
regex.should match(images[:gif].sample)
|
262
|
+
end
|
263
|
+
|
264
|
+
it "doesn't match any other extension" do
|
265
|
+
[:jpg, :png].each { |ext| regex.should_not match(images[ext].sample) }
|
266
|
+
end
|
267
|
+
end
|
268
|
+
end
|
269
|
+
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scrapifier
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Tiago Guedes
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-04-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.3'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.3'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '2.14'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '2.14'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10.1'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10.1'
|
69
|
+
description: A very simple way to extract meta information from URIs using the screen
|
70
|
+
scraping technique.
|
71
|
+
email:
|
72
|
+
- tiagopog@gmail.com
|
73
|
+
executables: []
|
74
|
+
extensions: []
|
75
|
+
extra_rdoc_files: []
|
76
|
+
files:
|
77
|
+
- ".gitignore"
|
78
|
+
- Gemfile
|
79
|
+
- LICENSE.txt
|
80
|
+
- README.md
|
81
|
+
- Rakefile
|
82
|
+
- lib/scrapifier.rb
|
83
|
+
- lib/scrapifier/methods.rb
|
84
|
+
- lib/scrapifier/support.rb
|
85
|
+
- lib/scrapifier/version.rb
|
86
|
+
- scrapifier.gemspec
|
87
|
+
- spec/factories/uris.rb
|
88
|
+
- spec/scrapifier_spec.rb
|
89
|
+
- spec/spec_helper.rb
|
90
|
+
homepage: https://github.com/tiagopog/scrapifier
|
91
|
+
licenses:
|
92
|
+
- MIT
|
93
|
+
metadata: {}
|
94
|
+
post_install_message:
|
95
|
+
rdoc_options: []
|
96
|
+
require_paths:
|
97
|
+
- lib
|
98
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0'
|
103
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
104
|
+
requirements:
|
105
|
+
- - ">="
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: '0'
|
108
|
+
requirements: []
|
109
|
+
rubyforge_project:
|
110
|
+
rubygems_version: 2.2.2
|
111
|
+
signing_key:
|
112
|
+
specification_version: 4
|
113
|
+
summary: Extends the Ruby String class with a screen scraping method.
|
114
|
+
test_files:
|
115
|
+
- spec/factories/uris.rb
|
116
|
+
- spec/scrapifier_spec.rb
|
117
|
+
- spec/spec_helper.rb
|