scrapifier 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d645984640446b98bdc5bdf71972c991b361b75e
4
+ data.tar.gz: 9bd772bf8ab26ab4dda602fa69eee8c12ae45e39
5
+ SHA512:
6
+ metadata.gz: ff5dd829fd8e41af883fccd65ade03bb44d968f71a5457a0f2ceeb3afb0e389f71b44c0b559b352793293f4675cd9556c6e0bbab9799b637f1c4e6e7bdbb61ee
7
+ data.tar.gz: 3b974c372000f5d4f795f32074bd4da64e4fd910e8623f8446169c18b6880b1967aee105bf5df4e9de36fd054697c567727d4a8eee48def87f51b93b4117f556
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ .rspec
7
+ Gemfile.lock
8
+ InstalledFiles
9
+ _yardoc
10
+ coverage
11
+ doc/
12
+ lib/bundler/man
13
+ pkg
14
+ rdoc
15
+ spec/reports
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
source 'https://rubygems.org'

# All gem dependencies are declared in scrapifier.gemspec.
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Tiago Guedes
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,83 @@
1
+ # Scrapifier
2
+
3
+ It's a Ruby gem that brings a very simple way to extract meta information from URIs using the screen scraping technique.
4
+
5
+ ## Installation
6
+
7
+ Compatible with Ruby 1.9.3+
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'scrapifier'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install scrapifier
20
+
21
+ ## Usage
22
+
23
+ The method finds a URI in the String and gets some meta information from it, like the page's title, description, images and the URI. All the data is returned in a well-formatted Hash.
24
+
25
+ #### Default usage.
26
+
27
+ ``` ruby
28
+ 'Wow! What an awesome site: http://adtangerine.com!'.scrapify
29
+ #=> {
30
+ # title: "AdTangerine | Advertising Platform for Social Media",
31
+ # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
32
+ # images: ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png", "http://s3-us-west-2.amazonaws.com/adtangerine-prod/users/avatars/000/000/834/thumb/275747_1118382211_1929809351_n.jpg", "http://adtangerine.com/assets/foobar.gif"],
33
+ # uri: "http://adtangerine.com"
34
+ # }
35
+ ```
36
+
37
+ #### Allow only certain image types.
38
+
39
+ ``` ruby
40
+ 'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: :jpg
41
+ #=> {
42
+ # title: "AdTangerine | Advertising Platform for Social Media",
43
+ # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
44
+ # images: ["http://s3-us-west-2.amazonaws.com/adtangerine-prod/users/avatars/000/000/834/thumb/275747_1118382211_1929809351_n.jpg"],
45
+ # uri: "http://adtangerine.com"
46
+ # }
47
+
48
+ 'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: [:png, :gif]
49
+ #=> {
50
+ # title: "AdTangerine | Advertising Platform for Social Media",
51
+ # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
52
+ # images: ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/foobar.gif"],
53
+ # uri: "http://adtangerine.com"
54
+ # }
55
+ ```
56
+
57
+ #### Choose which URI in the String should be scraped.
58
+
59
+ ``` ruby
60
+ 'Check out: http://adtangerine.com and www.twitflink.com'.scrapify which: 1
61
+ #=> {
62
+ # title: "TwitFlink | Find a link!",
63
+ # description: "TwitFlink is a very simple searching tool that allows people to find out links tweeted by any user from Twitter.",
64
+ # images: ["http://www.twitflink.com//assets/tf_logo.png", "http://twitflink.com/assets/tf_logo.png"],
65
+ # uri: "http://www.twitflink.com"
66
+ # }
67
+
68
+ 'Check out: http://adtangerine.com and www.twitflink.com'.scrapify({ which: 0, images: :gif })
69
+ #=> {
70
+ # title: "AdTangerine | Advertising Platform for Social Media",
71
+ # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
72
+ # images: ["http://adtangerine.com/assets/foobar.gif"],
73
+ # uri: "http://adtangerine.com"
74
+ # }
75
+ ```
76
+
77
+ ## Contributing
78
+
79
+ 1. Fork it
80
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
81
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
82
+ 4. Push to the branch (`git push origin my-new-feature`)
83
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
require 'bundler/gem_tasks'
require 'rspec/core/rake_task'

# `rake spec` runs the RSpec suite; it is also the default task.
RSpec::Core::RakeTask.new(:spec)
task default: :spec
@@ -0,0 +1,70 @@
1
# coding: utf-8
require 'nokogiri'
require 'open-uri'
require 'scrapifier/support'

module Scrapifier
  # Public scraping API mixed into String (see lib/scrapifier.rb).
  module Methods
    include Scrapifier::Support

    # Gets meta data from a URI using the screen scraping technique.
    #
    # Example:
    #   >> 'Wow! What an awesome site: http://adtangerine.com!'.scrapify
    #   => {
    #        title: "AdTangerine | Advertising Platform for Social Media",
    #        description: "AdTangerine is an advertising platform that...",
    #        images: ["http://adtangerine.com/assets/logo_adt_og.png", ...],
    #        uri: "http://adtangerine.com"
    #      }
    # Arguments:
    #   options: (Hash)
    #     - :which:  (Integer) which URI in the String to use, 0-based.
    #     - :images: (Symbol or Array) image extensions allowed in the result.
    #
    # Returns an empty Hash when no URI is found, the extension is not
    # allowed, or the page cannot be fetched/parsed (best-effort contract).
    def scrapify(options = {})
      uri = find_uri(options[:which])
      return {} if uri.nil?

      if uri =~ sf_regex(:image)
        sf_image_meta(uri, options[:images])
      else
        sf_page_meta(uri, options[:images])
      end
    end

    # Looks for URIs in the String.
    #
    # Example:
    #   >> 'Wow! What an awesome site: http://adtangerine.com!'.find_uri
    #   => 'http://adtangerine.com'
    #   >> 'Sites: http://adtangerine.com and www.twitflink.com'.find_uri 1
    #   => 'http://www.twitflink.com'
    # Arguments:
    #   which: (Integer)
    #     - Which URI in the String: first (0), second (1) and so on.
    #
    # Returns nil when no URI is matched; prepends "http://" when the
    # match carries no protocol (e.g. "www.twitflink.com").
    def find_uri(which = 0)
      # Coerce nil and String indexes ("1" -> 1); previously a String index
      # raised inside the rescue and silently returned nil.
      index = which.to_i
      match = scan(sf_regex(:uri))[index]
      uri = match && match[0]
      return uri if uri.nil? || uri =~ sf_regex(:protocol)
      "http://#{uri}"
    end

    private

    # Builds the meta Hash for a direct image URI: when the extension is
    # allowed, every key receives the image URI itself; otherwise {}.
    def sf_image_meta(uri, allowed)
      image = (sf_check_img_ext(uri, allowed)[0] rescue nil)
      return {} if image.nil?
      { title: image, description: image, uri: image, images: image }
    end

    # Downloads and parses the page, extracting title, description, images
    # and the URI. Any network/parsing failure yields an empty Hash.
    def sf_page_meta(uri, allowed)
      meta = {}
      doc = Nokogiri::HTML(open(uri).read)
      doc.encoding = 'utf-8'

      [:title, :description].each do |key|
        # '-' is the documented placeholder when a node is missing.
        meta[key] = (doc.xpath(sf_paths[key])[0].text rescue '-')
      end

      meta[:images] = sf_fix_imgs(doc.xpath(sf_paths[:image]), uri, allowed)
      meta[:uri] = uri
      meta
    rescue
      {}
    end
  end
end
@@ -0,0 +1,144 @@
1
module Scrapifier
  # Private helpers shared by Scrapifier::Methods: regex building, image
  # filtering and URI normalization.
  module Support
    private

    # Filters images, returning only those with allowed extensions.
    #
    # Example:
    #   >> sf_check_img_ext('http://source.com/image.gif', :jpg)
    #   => []
    #   >> sf_check_img_ext(['http://source.com/image.gif', 'http://source.com/image.jpg'], [:jpg, :png])
    #   => ['http://source.com/image.jpg']
    # Arguments:
    #   images: (String or Array)
    #     - Images which will be checked.
    #   allowed: (String, Symbol or Array)
    #     - Allowed image extensions; nil/empty means "allow all".
    def sf_check_img_ext(images, allowed = [])
      candidates = case images
                   when String then images.split
                   when Array  then images
                   else []
                   end
      candidates.select { |img| img =~ sf_regex(:image, allowed || []) }
    end

    # Selects regexes for URIs, protocols and image extensions.
    #
    # Example:
    #   >> sf_regex(:image, :jpg)
    #   => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i
    # Arguments:
    #   type: (Symbol or String)
    #     - Regex type: :uri, :protocol or :image.
    #   args: (*)
    #     - For :image, the allowed extensions.
    def sf_regex(type, *args)
      type = type.to_sym unless type.is_a?(Symbol)
      return sf_img_regex(args.flatten) if type == :image

      {
        uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
        protocol: /((ht|f)tp[s]?)/i
      }[type]
    end

    # Builds image regexes according to the required extensions.
    #
    # Example:
    #   >> sf_img_regex
    #   => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i
    #   >> sf_img_regex([:jpg, :png])
    #   => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|png)(\?.+)?$)/i
    # Arguments:
    #   exts: (Array)
    #     - Image extensions which will be included in the regex.
    def sf_img_regex(exts = [])
      exts = [exts].flatten unless exts.is_a?(Array)
      # Drop nils so a [nil] input can't yield an empty alternation group.
      exts = exts.compact
      if exts.empty?
        exts = %w(jpg jpeg png gif)
      elsif exts.include?(:jpg) && !exts.include?(:jpeg)
        # jpg implies jpeg; use += instead of push to avoid mutating the
        # caller's array.
        exts += [:jpeg]
      end
      # Interpolation builds the identical regex that eval did, without
      # executing generated code.
      /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(#{exts.join('|')})(\?.+)?$)/i
    end

    # Collection of XPath expressions used to pull content from HTML tags
    # via Node#xpath.
    # See more: http://nokogiri.org/tutorials/searching_a_xml_html_document.html
    #
    # Example:
    #   >> sf_paths[:title]
    #   => '//meta[@property = "og:title"]/@content | ... | //title | //h1'
    def sf_paths
      {
        title: '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1',
        description: '//meta[@property = "og:description"]/@content | //meta[@name = "description"]/@content | //meta[@name = "Description"]/@content | //h1 | //h3 | //p | //span | //font',
        image: '//meta[@property = "og:image"]/@content | //link[@rel = "image_src"]/@href | //meta[@itemprop = "image"]/@content | //div[@id = "logo"]/img/@src | //a[@id = "logo"]/img/@src | //div[@class = "logo"]/img/@src | //a[@class = "logo"]/img/@src | //a//img[@width]/@src | //img[@width]/@src | //a//img[@height]/@src | //img[@height]/@src | //a//img/@src | //span//img/@src'
      }
    end

    # Checks and returns only the valid image URIs.
    #
    # Example:
    #   >> sf_fix_imgs(['http://adtangerine.com/image.png', '/assets/image.jpg'], 'http://adtangerine.com', :jpg)
    #   => ['http://adtangerine.com/assets/image.jpg']
    # Arguments:
    #   imgs: (Array)
    #     - Image URIs got from the HTML doc.
    #   uri: (String)
    #     - Used as basis for URIs that don't have any protocol/domain set.
    #   exts: (Symbol or Array)
    #     - Allowed image extensions.
    def sf_fix_imgs(imgs, uri, exts = [])
      fixed = imgs.map do |img|
        img = img.to_s
        img = sf_fix_protocol(img, sf_domain(uri)) unless img =~ sf_regex(:protocol)
        img if img =~ sf_regex(:image)
      end
      sf_check_img_ext(fixed.compact, exts)
    end

    # Fixes image URIs that don't include a protocol/domain.
    #
    # Example:
    #   >> sf_fix_protocol('/assets/image.jpg', 'adtangerine.com')
    #   => 'http://adtangerine.com/assets/image.jpg'
    #   >> sf_fix_protocol('//s.ytimg.com/yts/img/youtub_img.png', 'youtube.com')
    #   => 'http://s.ytimg.com/yts/img/youtub_img.png'
    # Arguments:
    #   path: (String)
    #     - URI path having no protocol/domain set.
    #   domain: (String)
    #     - Domain that will be prepended onto the path.
    def sf_fix_protocol(path, domain)
      if path =~ /^\/\/[^\/]+/
        # Protocol-relative URI ("//host/path"): just prepend the scheme.
        'http:' << path
      else
        # Insert a slash only when the path doesn't already start with one.
        "http://#{domain}#{'/' unless path =~ /^\/[^\/]+/}#{path}"
      end
    end

    # Returns the domain from a URI.
    #
    # Example:
    #   >> sf_domain('http://adtangerine.com')
    #   => 'adtangerine.com'
    # Arguments:
    #   uri: (String)
    #     - URI.
    def sf_domain(uri)
      (uri.split('/')[2] rescue '')
    end
  end
end
@@ -0,0 +1,3 @@
1
module Scrapifier
  # Gem version, kept in sync with the gemspec via `require 'scrapifier/version'`.
  VERSION = '0.0.1'
end
data/lib/scrapifier.rb ADDED
@@ -0,0 +1,4 @@
1
# coding: utf-8
require 'scrapifier/methods'

# Extend every String with the Scrapifier public API (#scrapify, #find_uri).
String.send :include, Scrapifier::Methods
@@ -0,0 +1,26 @@
1
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'scrapifier/version'

Gem::Specification.new do |spec|
  spec.name          = 'scrapifier'
  spec.version       = Scrapifier::VERSION
  spec.authors       = ['Tiago Guedes']
  spec.email         = ['tiagopog@gmail.com']
  spec.description   = 'A very simple way to extract meta information from URIs using the screen scraping technique.'
  spec.summary       = 'Extends the Ruby String class with a screen scraping method.'
  spec.homepage      = 'https://github.com/tiagopog/scrapifier'
  spec.license       = 'MIT'

  # Package everything tracked by git; expose bin/ scripts and test files.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_runtime_dependency 'nokogiri', '~> 1.6'

  spec.add_development_dependency 'bundler', '~> 1.3'
  spec.add_development_dependency 'rspec', '~> 2.14'
  spec.add_development_dependency 'rake', '~> 10.1'
end
@@ -0,0 +1,40 @@
1
# Sample URIs, image links and reference regexes shared by the specs.
module Factories
  private

  # Fixture data: miscellaneous URIs, image URIs grouped by extension, and
  # the regexes the gem is expected to build.
  def sf_samples
    {
      misc: {
        http:  'http://adtangerine.com',
        https: 'https://rubygems.org/gems/string_awesome',
        ftp:   'ftp://ftpserver.com',
        www:   'www.twitflink.com'
      },
      images: {
        jpg: [
          'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
          'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
          'http://foobar.com.br/nice-image.jpg'
        ],
        png: [
          'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
          'https://foobar.br/awesome_image.png',
          'https://bar.foobar.br/foo/var/image.png?foo=bar',
        ],
        gif: [
          'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
          'http://foobar.com/ugly_image.gif',
          'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
        ]
      },
      regexes: {
        image: {
          all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
          jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
          png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
          gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
        },
        uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
        protocol: /((ht|f)tp[s]?)/i
      }
    }
  end
end
@@ -0,0 +1,269 @@
1
# coding: utf-8
require 'spec_helper'
include Factories

describe String do
  let(:images)  { sf_samples[:images] }
  let(:misc)    { sf_samples[:misc] }
  let(:regexes) { sf_samples[:regexes] }

  #
  # String#scrapify
  #

  describe '#scrapify' do
    context 'when no URI is matched in the String' do
      subject { 'String without any URI.'.scrapify }

      it { should eq({}) }
    end

    context 'when the website was not found' do
      subject { 'Check out this http://someweirduri.com.br'.scrapify }

      it { should eq({}) }
    end

    context 'when an image URI is matched' do
      let(:jpg) { images[:jpg][0] }
      let(:png) { images[:png][0] }
      let(:gif) { images[:gif][0] }

      it 'sets the same value for :title, :description and :uri keys' do
        "Say my name: #{jpg}".scrapify.should include(title: jpg, description: jpg, uri: jpg)
      end

      it 'allows all the standard image extensions by default (even GIFs)' do
        "Smile GIF! Oh, wait... #{gif}".scrapify.should include(title: gif, description: gif, uri: gif)
      end

      it 'returns an empty Hash if the extension is not allowed' do
        "PNG is awesome! #{png}".scrapify(images: [:jpg]).should eq({})
      end
    end

    context 'when a website URI is matched in the String and a Hash is returned' do
      subject(:hash) { "Look this awesome site #{misc[:http]}".scrapify }

      it "includes a field with the site's title" do
        hash[:title].is_a?(String).should be_true
        hash[:title].empty?.should be_false
      end

      it "includes a field with the site's description" do
        hash[:description].is_a?(String).should be_true
        hash[:description].empty?.should be_false
      end

      it 'includes a field with the page URI' do
        hash[:uri].is_a?(String).should be_true
        hash[:uri].empty?.should be_false
        hash[:uri].should eq(misc[:http])
      end

      it "includes a field with image URIs from the site's head/body" do
        hash[:images].is_a?(Array).should be_true
        hash[:images].sample.should match(regexes[:image][:all])
      end
    end

    it "includes a field with only the allowed types of image URIs from the site's head/body" do
      misc[:http].scrapify(images: :png)[:images].sample.should match(regexes[:image][:png])
    end

    it 'can choose the URI in the String to be scrapified' do
      hash = "Check out these awesome sites: #{misc[:http]} and #{misc[:www]}".scrapify(which: 1, images: :png)
      [:title, :description, :uri].each do |key|
        hash[key].is_a?(String).should be_true
        hash[key].empty?.should be_false
      end
      hash[:uri].should eq("http://#{misc[:www]}")
      hash[:images].sample.should match(regexes[:image][:png])
    end
  end

  #
  # String#find_uri
  #

  describe '#find_uri' do
    let(:sample_uris) { misc.map { |u| u[1] } }
    let(:str) { "Awesome sites: #{sample_uris.join ' and '}" }

    it 'matches the first URI in the String by default' do
      str.send(:find_uri).should eq(sample_uris[0])
    end

    it 'matches the second URI in the String (https)' do
      str.send(:find_uri, 1).should eq(sample_uris[1])
    end

    # NOTE: index 2 is the ftp sample URI ('ftp://ftpserver.com'), not www.
    it 'matches the third URI in the String (ftp)' do
      str.send(:find_uri, 2).should eq(sample_uris[2])
    end

    context 'when no URI is matched' do
      it 'returns nil' do
        'Lorem ipsum dolor.'.send(:find_uri).should be_nil
      end

      it 'returns nil (no presence of http|https|ftp|www)' do
        'Check this out: google.com'.send(:find_uri).should be_nil
      end
    end
  end

  #
  # String#sf_check_img_ext
  #

  describe '#sf_check_img_ext' do
    let(:img)  { images[:jpg].sample }
    let(:imgs) { images.map { |i| i[1] }.flatten }
    let(:checked) do
      {
        str:   ''.send(:sf_check_img_ext, img),
        array: ''.send(:sf_check_img_ext, imgs),
        jpg:   ''.send(:sf_check_img_ext, imgs, [:jpg]),
        png:   ''.send(:sf_check_img_ext, imgs, :png),
        gif:   ''.send(:sf_check_img_ext, imgs, 'gif')
      }
    end

    context 'when no argument is passed' do
      it { expect { ''.send(:sf_check_img_ext) }.to raise_error(ArgumentError) }
    end

    context 'when only the first argument is defined' do
      it 'allows a String as argument' do
        checked[:str].should have(1).item
      end

      it 'allows an Array as argument' do
        checked[:jpg].should have(3).item
      end

      it 'allows all the image extensions by default' do
        checked[:array].should have(9).item
      end
    end

    context 'when the two arguments are defined' do
      it 'allows a Symbol as the second argument' do
        checked[:png].should have(3).item
      end

      it 'allows a String as the second argument' do
        checked[:gif].should have(3).item
      end

      it 'allows an Array as the second argument' do
        checked[:jpg].should have(3).item
      end

      it 'returns an Array with only image types allowed' do
        [:jpg, :png, :gif].each { |ext| checked[ext].should have(3).item }
      end
    end

    context 'when no image is found/allowed' do
      it 'returns an empty Array' do
        # Was an empty example that asserted nothing.
        ''.send(:sf_check_img_ext, 'http://foobar.com/not_an_image.pdf').should eq([])
      end
    end

    it 'always returns an Array' do
      checked.each { |c| c[1].is_a?(Array).should be_true }
    end
  end

  #
  # String#sf_regex
  #

  describe '#sf_regex' do
    context 'when it needs a regex to match any kind of URI' do
      subject { ''.send(:sf_regex, :uri) }

      # Was misc[:http] for all four iterations, so https/ftp/www were
      # never actually exercised.
      [:http, :https, :ftp, :www].each do |p|
        it { should match(misc[p]) }
      end
    end

    context 'when it needs a regex to match only image uris' do
      subject { ''.send(:sf_regex, :image) }

      [:jpg, :png, :gif].each do |ext|
        it { should match(sf_samples[:images][ext].sample) }
      end
    end
  end

  #
  # String#sf_img_regex
  #

  describe '#sf_img_regex' do
    let(:img_regexes) { regexes[:image] }

    context 'when no argument is passed' do
      subject(:regex) { ''.send(:sf_img_regex) }

      it 'returns a regex that matches all image extensions' do
        regex.should eq(img_regexes[:all])
      end

      it 'matches all image extensions' do
        [:jpg, :png, :gif].each { |ext| images[ext].sample.should match(regex) }
      end
    end

    context 'when only jpg is allowed' do
      subject(:regex) { ''.send(:sf_img_regex, [:jpg]) }

      it 'returns a regex that matches only jpg images' do
        regex.should eq(img_regexes[:jpg])
      end

      it 'matches only the defined extension' do
        regex.should match(images[:jpg].sample)
      end

      it "doesn't match any other extension" do
        [:png, :gif].each { |ext| regex.should_not match(images[ext].sample) }
      end
    end

    context 'when only png is allowed' do
      subject(:regex) { ''.send(:sf_img_regex, :png) }

      it 'returns a regex that matches only png images' do
        regex.should eq(img_regexes[:png])
      end

      it 'matches only the defined extension' do
        regex.should match(images[:png].sample)
      end

      it "doesn't match any other extension" do
        [:jpg, :gif].each { |ext| regex.should_not match(images[ext].sample) }
      end
    end

    context 'when only gif (argh!) is allowed' do
      subject(:regex) { ''.send(:sf_img_regex, :gif) }

      it 'returns a regex that matches only gif images' do
        regex.should eq(img_regexes[:gif])
      end

      it 'matches only the defined extension' do
        regex.should match(images[:gif].sample)
      end

      it "doesn't match any other extension" do
        [:jpg, :png].each { |ext| regex.should_not match(images[ext].sample) }
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
require 'rubygems'
require 'bundler/setup'

# Load the gem under test plus the shared spec fixtures.
require 'scrapifier'
require 'factories/uris'
5
+
metadata ADDED
@@ -0,0 +1,117 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scrapifier
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Tiago Guedes
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-04-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '2.14'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '2.14'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10.1'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10.1'
69
+ description: A very simple way to extract meta information from URIs using the screen
70
+ scraping technique.
71
+ email:
72
+ - tiagopog@gmail.com
73
+ executables: []
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - ".gitignore"
78
+ - Gemfile
79
+ - LICENSE.txt
80
+ - README.md
81
+ - Rakefile
82
+ - lib/scrapifier.rb
83
+ - lib/scrapifier/methods.rb
84
+ - lib/scrapifier/support.rb
85
+ - lib/scrapifier/version.rb
86
+ - scrapifier.gemspec
87
+ - spec/factories/uris.rb
88
+ - spec/scrapifier_spec.rb
89
+ - spec/spec_helper.rb
90
+ homepage: https://github.com/tiagopog/scrapifier
91
+ licenses:
92
+ - MIT
93
+ metadata: {}
94
+ post_install_message:
95
+ rdoc_options: []
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ required_rubygems_version: !ruby/object:Gem::Requirement
104
+ requirements:
105
+ - - ">="
106
+ - !ruby/object:Gem::Version
107
+ version: '0'
108
+ requirements: []
109
+ rubyforge_project:
110
+ rubygems_version: 2.2.2
111
+ signing_key:
112
+ specification_version: 4
113
+ summary: Extends the Ruby String class with a screen scraping method.
114
+ test_files:
115
+ - spec/factories/uris.rb
116
+ - spec/scrapifier_spec.rb
117
+ - spec/spec_helper.rb