scrapifier 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +4 -0
- data/README.md +88 -83
- data/lib/scrapifier/methods.rb +25 -35
- data/lib/scrapifier/support.rb +152 -128
- data/lib/scrapifier/version.rb +1 -1
- data/spec/factories/{uris.rb → samples.rb} +0 -0
- data/spec/spec_helper.rb +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aa0d714c01fc436bf4f90f4baaf50e98bcb9197a
|
4
|
+
data.tar.gz: b9337a1a690c0a7f0c57b237f327eec80d9bbdda
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 75baa6ed1838759bd6c0ebc8a453120b6c2cfe8f484f4396a5401a1d1acd66eebb0377423750a647ca9417e0fb8f4677ba688d40a4bc7e642cee653a6a76131a
|
7
|
+
data.tar.gz: a77ab52807dbcf3a9846226641a00b210324e2d64db0d18324597be380e576faea9c65eb97b00ea0a98e9de6359329f3b295f168fd001b59a4b9b8390ef3badc
|
data/.travis.yml
ADDED
data/README.md
CHANGED
@@ -1,83 +1,88 @@
|
|
1
|
-
# Scrapifier
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
#
|
51
|
-
#
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
#
|
71
|
-
#
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
1
|
+
# Scrapifier
|
2
|
+
|
3
|
+
[Build Status](https://travis-ci.org/tiagopog/scrapifier)
|
4
|
+
[Code Climate](https://codeclimate.com/github/tiagopog/scrapifier)
|
5
|
+
[Dependency Status](https://gemnasium.com/tiagopog/scrapifier)
|
6
|
+
[Gem Version](http://badge.fury.io/rb/scrapifier)
|
7
|
+
|
8
|
+
It's a Ruby gem that brings a very simple way to extract meta information from URIs using the screen scraping technique.
|
9
|
+
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
Compatible with Ruby 1.9.3+
|
13
|
+
|
14
|
+
Add this line to your application's Gemfile:
|
15
|
+
|
16
|
+
gem 'scrapifier'
|
17
|
+
|
18
|
+
And then execute:
|
19
|
+
|
20
|
+
$ bundle
|
21
|
+
|
22
|
+
Or install it yourself as:
|
23
|
+
|
24
|
+
$ gem install scrapifier
|
25
|
+
|
26
|
+
## Usage
|
27
|
+
|
28
|
+
The method finds a URI in the String and gets some meta information from it, like the page's title, description, images and the URI. All the data is returned in a well-formatted Hash.
|
29
|
+
|
30
|
+
#### Default usage.
|
31
|
+
|
32
|
+
``` ruby
|
33
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify
|
34
|
+
#=> {
|
35
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
36
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
37
|
+
# images: ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png", "http://s3-us-west-2.amazonaws.com/adtangerine-prod/users/avatars/000/000/834/thumb/275747_1118382211_1929809351_n.jpg", "http://adtangerine.com/assets/foobar.gif"],
|
38
|
+
# uri: "http://adtangerine.com"
|
39
|
+
# }
|
40
|
+
```
|
41
|
+
|
42
|
+
#### Allow only certain image types.
|
43
|
+
|
44
|
+
``` ruby
|
45
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: :jpg
|
46
|
+
#=> {
|
47
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
48
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
49
|
+
# images: ["http://s3-us-west-2.amazonaws.com/adtangerine-prod/users/avatars/000/000/834/thumb/275747_1118382211_1929809351_n.jpg"],
|
50
|
+
# uri: "http://adtangerine.com"
|
51
|
+
# }
|
52
|
+
|
53
|
+
'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: [:png, :gif]
|
54
|
+
#=> {
|
55
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
56
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
57
|
+
# images: ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/foobar.gif"],
|
58
|
+
# uri: "http://adtangerine.com"
|
59
|
+
# }
|
60
|
+
```
|
61
|
+
|
62
|
+
#### Choose which URI you want to be scraped.
|
63
|
+
|
64
|
+
``` ruby
|
65
|
+
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify which: 1
|
66
|
+
#=> {
|
67
|
+
# title: "TwitFlink | Find a link!",
|
68
|
+
# description: "TwitFlink is a very simple searching tool that allows people to find out links tweeted by any user from Twitter.",
|
69
|
+
# images: ["http://www.twitflink.com//assets/tf_logo.png", "http://twitflink.com/assets/tf_logo.png"],
|
70
|
+
# uri: "http://www.twitflink.com"
|
71
|
+
# }
|
72
|
+
|
73
|
+
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify({ which: 0, images: :gif })
|
74
|
+
#=> {
|
75
|
+
# title: "AdTangerine | Advertising Platform for Social Media",
|
76
|
+
# description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
|
77
|
+
# images: ["http://adtangerine.com/assets/foobar.gif"],
|
78
|
+
# uri: "http://adtangerine.com"
|
79
|
+
# }
|
80
|
+
```
|
81
|
+
|
82
|
+
## Contributing
|
83
|
+
|
84
|
+
1. Fork it
|
85
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
86
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
87
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
88
|
+
5. Create new Pull Request
|
data/lib/scrapifier/methods.rb
CHANGED
@@ -4,67 +4,57 @@ require 'open-uri'
|
|
4
4
|
require 'scrapifier/support'
|
5
5
|
|
6
6
|
module Scrapifier
|
7
|
+
# Methods which will be included into the String class.
|
7
8
|
module Methods
|
8
9
|
include Scrapifier::Support
|
9
10
|
|
10
|
-
#
|
11
|
-
#
|
11
|
+
# Get metadata from a URI using the screen scraping technique.
|
12
|
+
#
|
12
13
|
# Example:
|
13
14
|
# >> 'Wow! What an awesome site: http://adtangerine.com!'.scrapify
|
14
15
|
# => {
|
15
16
|
# :title => "AdTangerine | Advertising Platform for Social Media",
|
16
|
-
# :description => "AdTangerine is an advertising platform that
|
17
|
-
# :images => [
|
17
|
+
# :description => "AdTangerine is an advertising platform that...",
|
18
|
+
# :images => [
|
19
|
+
# "http://adtangerine.com/assets/logo_adt_og.png",
|
20
|
+
# "http://adtangerine.com/assets/logo_adt_og.png"
|
21
|
+
# ],
|
18
22
|
# :uri => "http://adtangerine.com"
|
19
23
|
# }
|
20
24
|
# Arguments:
|
21
25
|
# options: (Hash)
|
22
|
-
# - which: (Integer)
|
23
|
-
#
|
24
|
-
|
26
|
+
# - which: (Integer)
|
27
|
+
# Which URI in the String will be used. It starts from 0 to N.
|
28
|
+
# - images: (Symbol or Array)
|
29
|
+
# Image extensions which are allowed to be returned as result.
|
25
30
|
def scrapify(options = {})
|
26
|
-
|
27
|
-
|
28
|
-
begin
|
29
|
-
if uri.nil?
|
30
|
-
raise
|
31
|
-
elsif uri =~ sf_regex(:image)
|
32
|
-
uri = (sf_check_img_ext(uri, options[:images])[0] rescue [])
|
33
|
-
raise if uri.empty?
|
34
|
-
[:title, :description, :uri, :images].each { |key| meta[key] = uri }
|
35
|
-
else
|
36
|
-
doc = Nokogiri::HTML(open(uri).read)
|
37
|
-
doc.encoding = 'utf-8'
|
38
|
-
|
39
|
-
[:title, :description].each do |key|
|
40
|
-
meta[key] = (doc.xpath(sf_paths[key])[0].text rescue '-')
|
41
|
-
end
|
31
|
+
uri, meta = find_uri(options[:which]), {}
|
32
|
+
return meta if uri.nil?
|
42
33
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
meta = {}
|
34
|
+
if !(uri =~ sf_regex(:image))
|
35
|
+
meta = sf_eval_uri(uri, options[:images])
|
36
|
+
elsif !sf_check_img_ext(uri, options[:images]).empty?
|
37
|
+
[:title, :description, :uri, :images].each { |k| meta[k] = uri }
|
48
38
|
end
|
49
39
|
|
50
40
|
meta
|
51
41
|
end
|
52
42
|
|
53
|
-
#
|
54
|
-
#
|
43
|
+
# Find URIs in the String.
|
44
|
+
#
|
55
45
|
# Example:
|
56
46
|
# >> 'Wow! What an awesome site: http://adtangerine.com!'.find_uri
|
57
47
|
# => 'http://adtangerine.com'
|
58
|
-
# >> '
|
48
|
+
# >> 'Very cool: http://adtangerine.com and www.twitflink.com'.find_uri 1
|
59
49
|
# => 'http://www.twitflink.com'
|
60
50
|
# Arguments:
|
61
51
|
# which: (Integer)
|
62
52
|
# - Which URI in the String: first (0), second (1) and so on.
|
63
|
-
|
64
53
|
def find_uri(which = 0)
|
65
|
-
which
|
66
|
-
which
|
67
|
-
|
54
|
+
which = scan(sf_regex(:uri))[which.to_i][0]
|
55
|
+
which =~ sf_regex(:protocol) ? which : "http://#{which}"
|
56
|
+
rescue NoMethodError
|
57
|
+
nil
|
68
58
|
end
|
69
59
|
end
|
70
60
|
end
|
data/lib/scrapifier/support.rb
CHANGED
@@ -1,144 +1,168 @@
|
|
1
1
|
module Scrapifier
|
2
2
|
module Support
|
3
|
-
|
4
|
-
# Filters images returning those with the allowed extentions.
|
5
|
-
#
|
6
|
-
# Example:
|
7
|
-
# >> sf_check_img_ext('http://source.com/image.gif', :jpg)
|
8
|
-
# => []
|
9
|
-
# >> sf_check_img_ext(['http://source.com/image.gif', 'http://source.com/image.jpg'], [:jpg, :png])
|
10
|
-
# => ['http://source.com/image.jpg']
|
11
|
-
# Arguments:
|
12
|
-
# images: (String or Array)
|
13
|
-
# - Images which will be checked.
|
14
|
-
# allowed: (String, Symbol or Array)
|
15
|
-
# - Allowed types of image extension.
|
16
|
-
|
17
|
-
def sf_check_img_ext(images, allowed = [])
|
18
|
-
allowed ||= []
|
19
|
-
if images.is_a?(String)
|
20
|
-
images = images.split
|
21
|
-
elsif !images.is_a?(Array)
|
22
|
-
images = []
|
23
|
-
end
|
24
|
-
images.select { |i| i =~ sf_regex(:image, allowed) }
|
25
|
-
end
|
3
|
+
module_function
|
26
4
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
5
|
+
# Evaluate the URI's HTML document and get its metadata.
|
6
|
+
#
|
7
|
+
# Example:
|
8
|
+
# >> sf_eval_uri('http://adtangerine.com', [:png])
|
9
|
+
# => {
|
10
|
+
# :title => "AdTangerine | Advertising Platform for Social Media",
|
11
|
+
# :description => "AdTangerine is an advertising platform that...",
|
12
|
+
# :images => [
|
13
|
+
# "http://adtangerine.com/assets/logo_adt_og.png",
|
14
|
+
# "http://adtangerine.com/assets/logo_adt_og.png"
|
15
|
+
# ],
|
16
|
+
# :uri => "http://adtangerine.com"
|
17
|
+
# }
|
18
|
+
# Arguments:
|
19
|
+
# uri: (String)
|
20
|
+
# - URI.
|
21
|
+
# imgs: (Array)
|
22
|
+
# - Allowed type of images.
|
23
|
+
def sf_eval_uri(uri, imgs = [])
|
24
|
+
doc = Nokogiri::HTML(open(uri).read)
|
25
|
+
doc.encoding, meta = 'utf-8', { uri: uri }
|
39
26
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
protocol: /((ht|f)tp[s]?)/i
|
48
|
-
}
|
49
|
-
regexes[type]
|
50
|
-
end
|
51
|
-
end
|
27
|
+
%i(title description).each { |k| meta[k] = (doc.xpath(sf_paths[k])[0].text rescue '-') }
|
28
|
+
meta[:images] = sf_fix_imgs(doc.xpath(sf_paths[:image]), uri, imgs)
|
29
|
+
|
30
|
+
meta
|
31
|
+
rescue SocketError
|
32
|
+
{}
|
33
|
+
end
|
52
34
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
eval "/(^http{1}[s]?:\\/\\/([w]{3}\\.)?.+\\.(#{exts.join('|')})(\\?.+)?$)/i"
|
35
|
+
# Filter images returning those with the allowed extensions.
|
36
|
+
#
|
37
|
+
# Example:
|
38
|
+
# >> sf_check_img_ext('http://source.com/image.gif', :jpg)
|
39
|
+
# => []
|
40
|
+
# >> sf_check_img_ext(['http://source.com/image.gif', 'http://source.com/image.jpg'], [:jpg, :png])
|
41
|
+
# => ['http://source.com/image.jpg']
|
42
|
+
# Arguments:
|
43
|
+
# images: (String or Array)
|
44
|
+
# - Images which will be checked.
|
45
|
+
# allowed: (String, Symbol or Array)
|
46
|
+
# - Allowed types of image extension.
|
47
|
+
def sf_check_img_ext(images, allowed = [])
|
48
|
+
allowed ||= []
|
49
|
+
if images.is_a?(String)
|
50
|
+
images = images.split
|
51
|
+
elsif !images.is_a?(Array)
|
52
|
+
images = []
|
72
53
|
end
|
54
|
+
images.select { |i| i =~ sf_regex(:image, allowed) }
|
55
|
+
end
|
73
56
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
57
|
+
# Select regexes for URIs, protocols and image extensions.
|
58
|
+
#
|
59
|
+
# Example:
|
60
|
+
# >> sf_regex(:uri)
|
61
|
+
# => /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i
|
62
|
+
# >> sf_regex(:image, :jpg)
|
63
|
+
# => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg)(\?.+)?$)/i
|
64
|
+
# Arguments:
|
65
|
+
# type: (Symbol or String)
|
66
|
+
# - Regex type.
|
67
|
+
# args: (*)
|
68
|
+
# - Anything.
|
69
|
+
def sf_regex(type, *args)
|
70
|
+
type = type.to_sym unless type.is_a? Symbol
|
71
|
+
if type == :image
|
72
|
+
sf_img_regex args.flatten
|
73
|
+
else
|
74
|
+
regexes = {
|
75
|
+
uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
|
76
|
+
protocol: /((ht|f)tp[s]?)/i
|
86
77
|
}
|
78
|
+
regexes[type]
|
87
79
|
end
|
80
|
+
end
|
88
81
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
82
|
+
# Build image regexes according to the required extensions.
|
83
|
+
#
|
84
|
+
# Example:
|
85
|
+
# >> sf_img_regex
|
86
|
+
# => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i
|
87
|
+
# >> sf_img_regex([:jpg, :png])
|
88
|
+
# => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|png)(\?.+)?$)/i
|
89
|
+
# Arguments:
|
90
|
+
# exts: (Array)
|
91
|
+
# - Image extensions which will be included in the regex.
|
92
|
+
def sf_img_regex(exts = [])
|
93
|
+
exts = [exts].flatten unless exts.is_a?(Array)
|
94
|
+
if exts.nil? or exts.empty?
|
95
|
+
exts = %w(jpg jpeg png gif)
|
96
|
+
elsif exts.include?(:jpg) and !exts.include?(:jpeg)
|
97
|
+
exts.push :jpeg
|
98
|
+
end
|
99
|
+
eval "/(^http{1}[s]?:\\/\\/([w]{3}\\.)?.+\\.(#{exts.join('|')})(\\?.+)?$)/i"
|
100
|
+
end
|
101
101
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
102
|
+
# Collection of paths used to get content from HTML tags via Node#xpath method.
|
103
|
+
# See more: http://nokogiri.org/tutorials/searching_a_xml_html_document.html
|
104
|
+
#
|
105
|
+
# Example:
|
106
|
+
# >> sf_paths[:title]
|
107
|
+
# => '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1'
|
108
|
+
def sf_paths
|
109
|
+
{
|
110
|
+
title: '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1',
|
111
|
+
description: '//meta[@property = "og:description"]/@content | //meta[@name = "description"]/@content | //meta[@name = "Description"]/@content | //h1 | //h3 | //p | //span | //font',
|
112
|
+
image: '//meta[@property = "og:image"]/@content | //link[@rel = "image_src"]/@href | //meta[@itemprop = "image"]/@content | //div[@id = "logo"]/img/@src | //a[@id = "logo"]/img/@src | //div[@class = "logo"]/img/@src | //a[@class = "logo"]/img/@src | //a//img[@width]/@src | //img[@width]/@src | //a//img[@height]/@src | //img[@height]/@src | //a//img/@src | //span//img/@src'
|
113
|
+
}
|
114
|
+
end
|
109
115
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
116
|
+
# Check and return only the valid image URIs.
|
117
|
+
#
|
118
|
+
# Example:
|
119
|
+
# >> sf_fix_imgs(['http://adtangerine.com/image.png', '/assets/image.jpg'], 'http://adtangerine.com', :jpg)
|
120
|
+
# => ['http://adtangerine.com/assets/image.jpg']
|
121
|
+
# Arguments:
|
122
|
+
# imgs: (Array)
|
123
|
+
# - Image URIs got from the HTML doc.
|
124
|
+
# uri: (String)
|
125
|
+
# - Used as basis to the URIs that don't have any protocol/domain set.
|
126
|
+
# exts: (Symbol or Array)
|
127
|
+
# - Allowed image extensions.
|
128
|
+
def sf_fix_imgs(imgs, uri, exts = [])
|
129
|
+
sf_check_img_ext(imgs.map do |img|
|
130
|
+
img = img.to_s
|
131
|
+
img = sf_fix_protocol(img, sf_domain(uri)) unless img =~ sf_regex(:protocol)
|
132
|
+
img if (img =~ sf_regex(:image))
|
133
|
+
end.compact, exts)
|
134
|
+
end
|
122
135
|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
136
|
+
# Fix image URIs that don't have a protocol/domain set.
|
137
|
+
#
|
138
|
+
# Example:
|
139
|
+
# >> sf_fix_protocol('/assets/image.jpg', 'adtangerine.com')
|
140
|
+
# => 'http://adtangerine.com/assets/image.jpg'
|
141
|
+
# >> sf_fix_protocol('//s.ytimg.com/yts/img/youtub_img.png', 'youtube.com')
|
142
|
+
# => 'http://s.ytimg.com/yts/img/youtub_img.png'
|
143
|
+
# Arguments:
|
144
|
+
# path: (String)
|
145
|
+
# - URI path having no protocol/domain set.
|
146
|
+
# domain: (String)
|
147
|
+
# - Domain that will be prepended into the path.
|
148
|
+
def sf_fix_protocol(path, domain)
|
149
|
+
if path =~ /^\/\/[^\/]+/
|
150
|
+
'http:' << path
|
151
|
+
else
|
152
|
+
"http://#{domain}#{'/' unless path =~ /^\/[^\/]+/}#{path}"
|
153
|
+
end
|
154
|
+
end
|
130
155
|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
end
|
156
|
+
# Return the URI domain.
|
157
|
+
#
|
158
|
+
# Example:
|
159
|
+
# >> sf_domain('http://adtangerine.com')
|
160
|
+
# => 'adtangerine.com'
|
161
|
+
# Arguments:
|
162
|
+
# uri: (String)
|
163
|
+
# - URI.
|
164
|
+
def sf_domain(uri)
|
165
|
+
(uri.split('/')[2] rescue '')
|
166
|
+
end
|
143
167
|
end
|
144
168
|
end
|
data/lib/scrapifier/version.rb
CHANGED
File without changes
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tiago Guedes
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-04-
|
11
|
+
date: 2014-04-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -75,6 +75,7 @@ extensions: []
|
|
75
75
|
extra_rdoc_files: []
|
76
76
|
files:
|
77
77
|
- ".gitignore"
|
78
|
+
- ".travis.yml"
|
78
79
|
- Gemfile
|
79
80
|
- LICENSE.txt
|
80
81
|
- README.md
|
@@ -84,7 +85,7 @@ files:
|
|
84
85
|
- lib/scrapifier/support.rb
|
85
86
|
- lib/scrapifier/version.rb
|
86
87
|
- scrapifier.gemspec
|
87
|
-
- spec/factories/
|
88
|
+
- spec/factories/samples.rb
|
88
89
|
- spec/scrapifier_spec.rb
|
89
90
|
- spec/spec_helper.rb
|
90
91
|
homepage: https://github.com/tiagopog/scrapifier
|
@@ -112,6 +113,6 @@ signing_key:
|
|
112
113
|
specification_version: 4
|
113
114
|
summary: Extends the Ruby String class with a screen scraping method.
|
114
115
|
test_files:
|
115
|
-
- spec/factories/
|
116
|
+
- spec/factories/samples.rb
|
116
117
|
- spec/scrapifier_spec.rb
|
117
118
|
- spec/spec_helper.rb
|