scrapifier 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +4 -0
- data/README.md +88 -83
- data/lib/scrapifier/methods.rb +25 -35
- data/lib/scrapifier/support.rb +152 -128
- data/lib/scrapifier/version.rb +1 -1
- data/spec/factories/{uris.rb → samples.rb} +0 -0
- data/spec/spec_helper.rb +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aa0d714c01fc436bf4f90f4baaf50e98bcb9197a
|
4
|
+
data.tar.gz: b9337a1a690c0a7f0c57b237f327eec80d9bbdda
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 75baa6ed1838759bd6c0ebc8a453120b6c2cfe8f484f4396a5401a1d1acd66eebb0377423750a647ca9417e0fb8f4677ba688d40a4bc7e642cee653a6a76131a
|
7
|
+
data.tar.gz: a77ab52807dbcf3a9846226641a00b210324e2d64db0d18324597be380e576faea9c65eb97b00ea0a98e9de6359329f3b295f168fd001b59a4b9b8390ef3badc
|
data/.travis.yml
ADDED
data/README.md
CHANGED
@@ -1,83 +1,88 @@
|
|
1
|
-
# Scrapifier
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
#
|
51
|
-
#
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
#
|
71
|
-
#
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
1
|
+
# Scrapifier
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/tiagopog/scrapifier.svg?branch=master)](https://travis-ci.org/tiagopog/scrapifier)
|
4
|
+
[![Code Climate](https://codeclimate.com/github/tiagopog/scrapifier.png)](https://codeclimate.com/github/tiagopog/scrapifier)
|
5
|
+
[![Dependency Status](https://gemnasium.com/tiagopog/scrapifier.svg)](https://gemnasium.com/tiagopog/scrapifier)
|
6
|
+
[![Gem Version](https://badge.fury.io/rb/scrapifier.svg)](http://badge.fury.io/rb/scrapifier)
|
7
|
+
|
8
|
+
It's a Ruby gem that brings a very simple way to extract meta information from URIs using the screen scraping technique.
|
9
|
+
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
Compatible with Ruby 1.9.3+
|
13
|
+
|
14
|
+
Add this line to your application's Gemfile:
|
15
|
+
|
16
|
+
gem 'scrapifier'
|
17
|
+
|
18
|
+
And then execute:
|
19
|
+
|
20
|
+
$ bundle
|
21
|
+
|
22
|
+
Or install it yourself as:
|
23
|
+
|
24
|
+
$ gem install scrapifier
|
25
|
+
|
26
|
+
## Usage
|
27
|
+
|
28
|
+
The `scrapify` method finds a URI in the String and gets some meta information from it, like the page's title, description, images and the URI. All the data is returned in a well-formatted Hash.
|
29
|
+
|
30
|
+
#### Default usage.
|
31
|
+
|
32
|
+
``` ruby
|
33
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify
|
34
|
+
#=> {
|
35
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
36
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
37
|
+
# images: ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png", "http://s3-us-west-2.amazonaws.com/adtangerine-prod/users/avatars/000/000/834/thumb/275747_1118382211_1929809351_n.jpg", "http://adtangerine.com/assets/foobar.gif"],
|
38
|
+
# uri: "http://adtangerine.com"
|
39
|
+
# }
|
40
|
+
```
|
41
|
+
|
42
|
+
#### Allow only certain image types.
|
43
|
+
|
44
|
+
``` ruby
|
45
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: :jpg
|
46
|
+
#=> {
|
47
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
48
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
49
|
+
# images: ["http://s3-us-west-2.amazonaws.com/adtangerine-prod/users/avatars/000/000/834/thumb/275747_1118382211_1929809351_n.jpg"],
|
50
|
+
# uri: "http://adtangerine.com"
|
51
|
+
# }
|
52
|
+
|
53
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: [:png, :gif]
|
54
|
+
#=> {
|
55
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
56
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
57
|
+
# images: ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/foobar.gif"],
|
58
|
+
# uri: "http://adtangerine.com"
|
59
|
+
# }
|
60
|
+
```
|
61
|
+
|
62
|
+
#### Choose which URI you want to be scraped.
|
63
|
+
|
64
|
+
``` ruby
|
65
|
+
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify which: 1
|
66
|
+
#=> {
|
67
|
+
# title: "TwitFlink | Find a link!",
|
68
|
+
# description: "TwitFlink is a very simple searching tool that allows people to find out links tweeted by any user from Twitter.",
|
69
|
+
# images: ["http://www.twitflink.com//assets/tf_logo.png", "http://twitflink.com/assets/tf_logo.png"],
|
70
|
+
# uri: "http://www.twitflink.com"
|
71
|
+
# }
|
72
|
+
|
73
|
+
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify({ which: 0, images: :gif })
|
74
|
+
#=> {
|
75
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
76
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
77
|
+
# images: ["http://adtangerine.com/assets/foobar.gif"],
|
78
|
+
# uri: "http://adtangerine.com"
|
79
|
+
# }
|
80
|
+
```
|
81
|
+
|
82
|
+
## Contributing
|
83
|
+
|
84
|
+
1. Fork it
|
85
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
86
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
87
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
88
|
+
5. Create new Pull Request
|
data/lib/scrapifier/methods.rb
CHANGED
@@ -4,67 +4,57 @@ require 'open-uri'
|
|
4
4
|
require 'scrapifier/support'
|
5
5
|
|
6
6
|
module Scrapifier
|
7
|
+
# Methods which will be included into the String class.
|
7
8
|
module Methods
|
8
9
|
include Scrapifier::Support
|
9
10
|
|
10
|
-
#
|
11
|
-
#
|
11
|
+
# Get metadata from a URI using the screen scraping technique.
|
12
|
+
#
|
12
13
|
# Example:
|
13
14
|
# >> 'Wow! What an awesome site: http://adtangerine.com!'.scrapify
|
14
15
|
# => {
|
15
16
|
# :title => "AdTangerine | Advertising Platform for Social Media",
|
16
|
-
# :description => "AdTangerine is an advertising platform that
|
17
|
-
# :images => [
|
17
|
+
# :description => "AdTangerine is an advertising platform that...",
|
18
|
+
# :images => [
|
19
|
+
# "http://adtangerine.com/assets/logo_adt_og.png",
|
20
|
+
# "http://adtangerine.com/assets/logo_adt_og.png"
|
21
|
+
# ],
|
18
22
|
# :uri => "http://adtangerine.com"
|
19
23
|
# }
|
20
24
|
# Arguments:
|
21
25
|
# options: (Hash)
|
22
|
-
# - which: (Integer)
|
23
|
-
#
|
24
|
-
|
26
|
+
# - which: (Integer)
|
27
|
+
# Which URI in the String will be used. It starts from 0 to N.
|
28
|
+
# - images: (Symbol or Array)
|
29
|
+
# Image extensions which are allowed to be returned as result.
|
25
30
|
def scrapify(options = {})
|
26
|
-
|
27
|
-
|
28
|
-
begin
|
29
|
-
if uri.nil?
|
30
|
-
raise
|
31
|
-
elsif uri =~ sf_regex(:image)
|
32
|
-
uri = (sf_check_img_ext(uri, options[:images])[0] rescue [])
|
33
|
-
raise if uri.empty?
|
34
|
-
[:title, :description, :uri, :images].each { |key| meta[key] = uri }
|
35
|
-
else
|
36
|
-
doc = Nokogiri::HTML(open(uri).read)
|
37
|
-
doc.encoding = 'utf-8'
|
38
|
-
|
39
|
-
[:title, :description].each do |key|
|
40
|
-
meta[key] = (doc.xpath(sf_paths[key])[0].text rescue '-')
|
41
|
-
end
|
31
|
+
uri, meta = find_uri(options[:which]), {}
|
32
|
+
return meta if uri.nil?
|
42
33
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
meta = {}
|
34
|
+
if !(uri =~ sf_regex(:image))
|
35
|
+
meta = sf_eval_uri(uri, options[:images])
|
36
|
+
elsif !sf_check_img_ext(uri, options[:images]).empty?
|
37
|
+
[:title, :description, :uri, :images].each { |k| meta[k] = uri }
|
48
38
|
end
|
49
39
|
|
50
40
|
meta
|
51
41
|
end
|
52
42
|
|
53
|
-
#
|
54
|
-
#
|
43
|
+
# Find URIs in the String.
|
44
|
+
#
|
55
45
|
# Example:
|
56
46
|
# >> 'Wow! What an awesome site: http://adtangerine.com!'.find_uri
|
57
47
|
# => 'http://adtangerine.com'
|
58
|
-
# >> '
|
48
|
+
# >> 'Very cool: http://adtangerine.com and www.twitflink.com'.find_uri 1
|
59
49
|
# => 'www.twitflink.com'
|
60
50
|
# Arguments:
|
61
51
|
# which: (Integer)
|
62
52
|
# - Which URI in the String: first (0), second (1) and so on.
|
63
|
-
|
64
53
|
def find_uri(which = 0)
|
65
|
-
which
|
66
|
-
which
|
67
|
-
|
54
|
+
which = scan(sf_regex(:uri))[which.to_i][0]
|
55
|
+
which =~ sf_regex(:protocol) ? which : "http://#{which}"
|
56
|
+
rescue NoMethodError
|
57
|
+
nil
|
68
58
|
end
|
69
59
|
end
|
70
60
|
end
|
data/lib/scrapifier/support.rb
CHANGED
@@ -1,144 +1,168 @@
|
|
1
1
|
module Scrapifier
|
2
2
|
module Support
|
3
|
-
|
4
|
-
# Filters images returning those with the allowed extentions.
|
5
|
-
#
|
6
|
-
# Example:
|
7
|
-
# >> sf_check_img_ext('http://source.com/image.gif', :jpg)
|
8
|
-
# => []
|
9
|
-
# >> sf_check_img_ext(['http://source.com/image.gif', 'http://source.com/image.jpg'], [:jpg, :png])
|
10
|
-
# => ['http://source.com/image.jpg']
|
11
|
-
# Arguments:
|
12
|
-
# images: (String or Array)
|
13
|
-
# - Images which will be checked.
|
14
|
-
# allowed: (String, Symbol or Array)
|
15
|
-
# - Allowed types of image extension.
|
16
|
-
|
17
|
-
def sf_check_img_ext(images, allowed = [])
|
18
|
-
allowed ||= []
|
19
|
-
if images.is_a?(String)
|
20
|
-
images = images.split
|
21
|
-
elsif !images.is_a?(Array)
|
22
|
-
images = []
|
23
|
-
end
|
24
|
-
images.select { |i| i =~ sf_regex(:image, allowed) }
|
25
|
-
end
|
3
|
+
module_function
|
26
4
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
5
|
+
# Evaluate the URI's HTML document and get its metadata.
|
6
|
+
#
|
7
|
+
# Example:
|
8
|
+
# >> sf_eval_uri('http://adtangerine.com', [:png])
|
9
|
+
# => {
|
10
|
+
# :title => "AdTangerine | Advertising Platform for Social Media",
|
11
|
+
# :description => "AdTangerine is an advertising platform that...",
|
12
|
+
# :images => [
|
13
|
+
# "http://adtangerine.com/assets/logo_adt_og.png",
|
14
|
+
# "http://adtangerine.com/assets/logo_adt_og.png"
|
15
|
+
# ],
|
16
|
+
# :uri => "http://adtangerine.com"
|
17
|
+
# }
|
18
|
+
# Arguments:
|
19
|
+
# uri: (String)
|
20
|
+
# - URI.
|
21
|
+
# imgs: (Array)
|
22
|
+
# - Allowed type of images.
|
23
|
+
def sf_eval_uri(uri, imgs = [])
|
24
|
+
doc = Nokogiri::HTML(open(uri).read)
|
25
|
+
doc.encoding, meta = 'utf-8', { uri: uri }
|
39
26
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
protocol: /((ht|f)tp[s]?)/i
|
48
|
-
}
|
49
|
-
regexes[type]
|
50
|
-
end
|
51
|
-
end
|
27
|
+
%i(title description).each { |k| meta[k] = (doc.xpath(sf_paths[k])[0].text rescue '-') }
|
28
|
+
meta[:images] = sf_fix_imgs(doc.xpath(sf_paths[:image]), uri, imgs)
|
29
|
+
|
30
|
+
meta
|
31
|
+
rescue SocketError
|
32
|
+
{}
|
33
|
+
end
|
52
34
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
eval "/(^http{1}[s]?:\\/\\/([w]{3}\\.)?.+\\.(#{exts.join('|')})(\\?.+)?$)/i"
|
35
|
+
# Filter images returning those with the allowed extensions.
|
36
|
+
#
|
37
|
+
# Example:
|
38
|
+
# >> sf_check_img_ext('http://source.com/image.gif', :jpg)
|
39
|
+
# => []
|
40
|
+
# >> sf_check_img_ext(['http://source.com/image.gif', 'http://source.com/image.jpg'], [:jpg, :png])
|
41
|
+
# => ['http://source.com/image.jpg']
|
42
|
+
# Arguments:
|
43
|
+
# images: (String or Array)
|
44
|
+
# - Images which will be checked.
|
45
|
+
# allowed: (String, Symbol or Array)
|
46
|
+
# - Allowed types of image extension.
|
47
|
+
def sf_check_img_ext(images, allowed = [])
|
48
|
+
allowed ||= []
|
49
|
+
if images.is_a?(String)
|
50
|
+
images = images.split
|
51
|
+
elsif !images.is_a?(Array)
|
52
|
+
images = []
|
72
53
|
end
|
54
|
+
images.select { |i| i =~ sf_regex(:image, allowed) }
|
55
|
+
end
|
73
56
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
57
|
+
# Select regexes for URIs, protocols and image extensions.
|
58
|
+
#
|
59
|
+
# Example:
|
60
|
+
# >> sf_regex(:uri)
|
61
|
+
# => /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
|
62
|
+
# >> sf_regex(:image, :jpg)
|
63
|
+
# => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg)(\?.+)?$)/i
|
64
|
+
# Arguments:
|
65
|
+
# type: (Symbol or String)
|
66
|
+
# - Regex type.
|
67
|
+
# args: (*)
|
68
|
+
# - Anything.
|
69
|
+
def sf_regex(type, *args)
|
70
|
+
type = type.to_sym unless type.is_a? Symbol
|
71
|
+
if type == :image
|
72
|
+
sf_img_regex args.flatten
|
73
|
+
else
|
74
|
+
regexes = {
|
75
|
+
uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
|
76
|
+
protocol: /((ht|f)tp[s]?)/i
|
86
77
|
}
|
78
|
+
regexes[type]
|
87
79
|
end
|
80
|
+
end
|
88
81
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
82
|
+
# Build image regexes according to the required extensions.
|
83
|
+
#
|
84
|
+
# Example:
|
85
|
+
# >> sf_img_regex
|
86
|
+
# => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i
|
87
|
+
# >> sf_img_regex([:jpg, :png])
|
88
|
+
# => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|png)(\?.+)?$)/i
|
89
|
+
# Arguments:
|
90
|
+
# exts: (Array)
|
91
|
+
# - Image extensions which will be included in the regex.
|
92
|
+
def sf_img_regex(exts = [])
|
93
|
+
exts = [exts].flatten unless exts.is_a?(Array)
|
94
|
+
if exts.nil? or exts.empty?
|
95
|
+
exts = %w(jpg jpeg png gif)
|
96
|
+
elsif exts.include?(:jpg) and !exts.include?(:jpeg)
|
97
|
+
exts.push :jpeg
|
98
|
+
end
|
99
|
+
eval "/(^http{1}[s]?:\\/\\/([w]{3}\\.)?.+\\.(#{exts.join('|')})(\\?.+)?$)/i"
|
100
|
+
end
|
101
101
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
102
|
+
# Collection of paths used to get content from HTML tags via Node#xpath method.
|
103
|
+
# See more: http://nokogiri.org/tutorials/searching_a_xml_html_document.html
|
104
|
+
#
|
105
|
+
# Example:
|
106
|
+
# >> sf_paths[:title]
|
107
|
+
# => '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1'
|
108
|
+
def sf_paths
|
109
|
+
{
|
110
|
+
title: '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1',
|
111
|
+
description: '//meta[@property = "og:description"]/@content | //meta[@name = "description"]/@content | //meta[@name = "Description"]/@content | //h1 | //h3 | //p | //span | //font',
|
112
|
+
image: '//meta[@property = "og:image"]/@content | //link[@rel = "image_src"]/@href | //meta[@itemprop = "image"]/@content | //div[@id = "logo"]/img/@src | //a[@id = "logo"]/img/@src | //div[@class = "logo"]/img/@src | //a[@class = "logo"]/img/@src | //a//img[@width]/@src | //img[@width]/@src | //a//img[@height]/@src | //img[@height]/@src | //a//img/@src | //span//img/@src'
|
113
|
+
}
|
114
|
+
end
|
109
115
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
116
|
+
# Check and return only the valid image URIs.
|
117
|
+
#
|
118
|
+
# Example:
|
119
|
+
# >> sf_fix_imgs(['http://adtangerine.com/image.png', '/assets/image.jpg'], 'http://adtangerine.com', :jpg)
|
120
|
+
# => ['http://adtangerine.com/assets/image.jpg']
|
121
|
+
# Arguments:
|
122
|
+
# imgs: (Array)
|
123
|
+
# - Image URIs got from the HTML doc.
|
124
|
+
# uri: (String)
|
125
|
+
# - Used as basis to the URIs that don't have any protocol/domain set.
|
126
|
+
# exts: (Symbol or Array)
|
127
|
+
# - Allowed image extensions.
|
128
|
+
def sf_fix_imgs(imgs, uri, exts = [])
|
129
|
+
sf_check_img_ext(imgs.map do |img|
|
130
|
+
img = img.to_s
|
131
|
+
img = sf_fix_protocol(img, sf_domain(uri)) unless img =~ sf_regex(:protocol)
|
132
|
+
img if (img =~ sf_regex(:image))
|
133
|
+
end.compact, exts)
|
134
|
+
end
|
122
135
|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
136
|
+
# Fix image URIs that don't have a protocol/domain set.
|
137
|
+
#
|
138
|
+
# Example:
|
139
|
+
# >> sf_fix_protocol('/assets/image.jpg', 'adtangerine.com')
|
140
|
+
# => 'http://adtangerine.com/assets/image.jpg'
|
141
|
+
# >> sf_fix_protocol('//s.ytimg.com/yts/img/youtub_img.png', 'https://youtube.com')
|
142
|
+
# => 'http://s.ytimg.com/yts/img/youtub_img.png'
|
143
|
+
# Arguments:
|
144
|
+
# path: (String)
|
145
|
+
# - URI path having no protocol/domain set.
|
146
|
+
# domain: (String)
|
147
|
+
# - Domain that will be prepended into the path.
|
148
|
+
def sf_fix_protocol(path, domain)
|
149
|
+
if path =~ /^\/\/[^\/]+/
|
150
|
+
'http:' << path
|
151
|
+
else
|
152
|
+
"http://#{domain}#{'/' unless path =~ /^\/[^\/]+/}#{path}"
|
153
|
+
end
|
154
|
+
end
|
130
155
|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
end
|
156
|
+
# Return the URI domain.
|
157
|
+
#
|
158
|
+
# Example:
|
159
|
+
# >> sf_domain('http://adtangerine.com')
|
160
|
+
# => 'adtangerine.com'
|
161
|
+
# Arguments:
|
162
|
+
# uri: (String)
|
163
|
+
# - URI.
|
164
|
+
def sf_domain(uri)
|
165
|
+
(uri.split('/')[2] rescue '')
|
166
|
+
end
|
143
167
|
end
|
144
168
|
end
|
data/lib/scrapifier/version.rb
CHANGED
File without changes
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tiago Guedes
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-04-
|
11
|
+
date: 2014-04-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -75,6 +75,7 @@ extensions: []
|
|
75
75
|
extra_rdoc_files: []
|
76
76
|
files:
|
77
77
|
- ".gitignore"
|
78
|
+
- ".travis.yml"
|
78
79
|
- Gemfile
|
79
80
|
- LICENSE.txt
|
80
81
|
- README.md
|
@@ -84,7 +85,7 @@ files:
|
|
84
85
|
- lib/scrapifier/support.rb
|
85
86
|
- lib/scrapifier/version.rb
|
86
87
|
- scrapifier.gemspec
|
87
|
-
- spec/factories/
|
88
|
+
- spec/factories/samples.rb
|
88
89
|
- spec/scrapifier_spec.rb
|
89
90
|
- spec/spec_helper.rb
|
90
91
|
homepage: https://github.com/tiagopog/scrapifier
|
@@ -112,6 +113,6 @@ signing_key:
|
|
112
113
|
specification_version: 4
|
113
114
|
summary: Extends the Ruby String class with a screen scraping method.
|
114
115
|
test_files:
|
115
|
-
- spec/factories/
|
116
|
+
- spec/factories/samples.rb
|
116
117
|
- spec/scrapifier_spec.rb
|
117
118
|
- spec/spec_helper.rb
|