scrapifier 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d645984640446b98bdc5bdf71972c991b361b75e
4
+ data.tar.gz: 9bd772bf8ab26ab4dda602fa69eee8c12ae45e39
5
+ SHA512:
6
+ metadata.gz: ff5dd829fd8e41af883fccd65ade03bb44d968f71a5457a0f2ceeb3afb0e389f71b44c0b559b352793293f4675cd9556c6e0bbab9799b637f1c4e6e7bdbb61ee
7
+ data.tar.gz: 3b974c372000f5d4f795f32074bd4da64e4fd910e8623f8446169c18b6880b1967aee105bf5df4e9de36fd054697c567727d4a8eee48def87f51b93b4117f556
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ .rspec
7
+ Gemfile.lock
8
+ InstalledFiles
9
+ _yardoc
10
+ coverage
11
+ doc/
12
+ lib/bundler/man
13
+ pkg
14
+ rdoc
15
+ spec/reports
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in scrapifier.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Tiago Guedes
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,83 @@
1
+ # Scrapifier
2
+
3
+ It's a Ruby gem that brings a very simple way to extract meta information from URIs using the screen scraping technique.
4
+
5
+ ## Installation
6
+
7
+ Compatible with Ruby 1.9.3+
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'scrapifier'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install scrapifier
20
+
21
+ ## Usage
22
+
23
+ The method finds a URI in the String and gets some meta information from it, like the page's title, description, images and the URI. All the data is returned in a well-formatted Hash.
24
+
25
+ #### Default usage.
26
+
27
+ ``` ruby
28
+ 'Wow! What an awesome site: http://adtangerine.com!'.scrapify
29
+ #=> {
30
+ # title: "AdTangerine | Advertising Platform for Social Media",
31
+ # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
32
+ # images: ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png", "http://s3-us-west-2.amazonaws.com/adtangerine-prod/users/avatars/000/000/834/thumb/275747_1118382211_1929809351_n.jpg", "http://adtangerine.com/assets/foobar.gif"],
33
+ # uri: "http://adtangerine.com"
34
+ # }
35
+ ```
36
+
37
+ #### Allow only certain image types.
38
+
39
+ ``` ruby
40
+ 'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: :jpg
41
+ #=> {
42
+ # title: "AdTangerine | Advertising Platform for Social Media",
43
+ # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
44
+ # images: ["http://s3-us-west-2.amazonaws.com/adtangerine-prod/users/avatars/000/000/834/thumb/275747_1118382211_1929809351_n.jpg"],
45
+ # uri: "http://adtangerine.com"
46
+ # }
47
+
48
+ 'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: [:png, :gif]
49
+ #=> {
50
+ # title: "AdTangerine | Advertising Platform for Social Media",
51
+ # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
52
+ # images: ["http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/logo_adt_og.png", "http://adtangerine.com/assets/foobar.gif"],
53
+ # uri: "http://adtangerine.com"
54
+ # }
55
+ ```
56
+
57
+ #### Choose which URI you want to be scraped.
58
+
59
+ ``` ruby
60
+ 'Check out: http://adtangerine.com and www.twitflink.com'.scrapify which: 1
61
+ #=> {
62
+ # title: "TwitFlink | Find a link!",
63
+ # description: "TwitFlink is a very simple searching tool that allows people to find out links tweeted by any user from Twitter.",
64
+ # images: ["http://www.twitflink.com//assets/tf_logo.png", "http://twitflink.com/assets/tf_logo.png"],
65
+ # uri: "http://www.twitflink.com"
66
+ # }
67
+
68
+ 'Check out: http://adtangerine.com and www.twitflink.com'.scrapify({ which: 0, images: :gif })
69
+ #=> {
70
+ # title: "AdTangerine | Advertising Platform for Social Media",
71
+ # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
72
+ # images: ["http://adtangerine.com/assets/foobar.gif"],
73
+ # uri: "http://adtangerine.com"
74
+ # }
75
+ ```
76
+
77
+ ## Contributing
78
+
79
+ 1. Fork it
80
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
81
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
82
+ 4. Push to the branch (`git push origin my-new-feature`)
83
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+ task default: :spec
@@ -0,0 +1,70 @@
1
+ # coding: utf-8
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+ require 'scrapifier/support'
5
+
6
module Scrapifier
  module Methods
    include Scrapifier::Support

    # Gets meta data from a URI using the screen scraping technique.
    #
    # Example:
    #   >> 'Wow! What an awesome site: http://adtangerine.com!'.scrapify
    #   => {
    #        :title       => "AdTangerine | Advertising Platform for Social Media",
    #        :description => "AdTangerine is an advertising platform that...",
    #        :images      => ["http://adtangerine.com/assets/logo_adt_og.png", ...],
    #        :uri         => "http://adtangerine.com"
    #      }
    #
    # Arguments:
    #   options: (Hash)
    #     - which:  (Integer) which URI in the String will be used, starting from 0.
    #     - images: (Symbol or Array) image extensions allowed in the result.
    #
    # Returns a Hash with :title, :description, :images and :uri keys, or an
    # empty Hash when no URI is found or the page cannot be fetched/parsed.
    def scrapify(options = {})
      meta, uri = {}, find_uri(options[:which])

      begin
        raise if uri.nil?

        if uri =~ sf_regex(:image)
          # The URI itself points to an image: keep it only if its extension
          # is allowed, and mirror it into every meta field.
          uri = (sf_check_img_ext(uri, options[:images])[0] rescue [])
          raise if uri.empty?
          [:title, :description, :uri, :images].each { |key| meta[key] = uri }
        else
          # Use open-uri's URI#open instead of Kernel#open: Kernel#open can
          # spawn a subprocess when handed a string beginning with '|'.
          doc = Nokogiri::HTML(URI.parse(uri).open.read)
          doc.encoding = 'utf-8'

          [:title, :description].each do |key|
            # '-' is the placeholder when the tag is missing from the page.
            meta[key] = (doc.xpath(sf_paths[key])[0].text rescue '-')
          end

          meta[:images] = sf_fix_imgs(doc.xpath(sf_paths[:image]), uri, options[:images])
          meta[:uri] = uri
        end
      rescue
        # Any failure (no URI, unreachable host, parse error) yields an
        # empty Hash by design — callers treat {} as "nothing scraped".
        meta = {}
      end

      meta
    end

    # Looks for URIs in the String.
    #
    # Example:
    #   >> 'Wow! What an awesome site: http://adtangerine.com!'.find_uri
    #   => 'http://adtangerine.com'
    #   >> 'Sites: http://adtangerine.com and www.twitflink.com'.find_uri 1
    #   => 'www.twitflink.com'
    #
    # Arguments:
    #   which: (Integer)
    #     - Which URI in the String: first (0), second (1) and so on.
    #
    # Returns the matched URI (prefixed with "http://" when it carries no
    # protocol) or nil when nothing is matched.
    def find_uri(which = 0)
      # Coerce nil and String arguments alike so scan's index is an Integer.
      which = (which || 0).to_i
      found = (self.scan(sf_regex(:uri))[which][0] rescue nil)
      return found if found.nil? || found =~ sf_regex(:protocol)
      "http://#{found}"
    end
  end
end
@@ -0,0 +1,144 @@
1
+ module Scrapifier
2
+ module Support
3
+ private
4
+ # Filters images returning those with the allowed extentions.
5
+ #
6
+ # Example:
7
+ # >> sf_check_img_ext('http://source.com/image.gif', :jpg)
8
+ # => []
9
+ # >> sf_check_img_ext(['http://source.com/image.gif', 'http://source.com/image.jpg'], [:jpg, :png])
10
+ # => ['http://source.com/image.jpg']
11
+ # Arguments:
12
+ # images: (String or Array)
13
+ # - Images which will be checked.
14
+ # allowed: (String, Symbol or Array)
15
+ # - Allowed types of image extension.
16
+
17
+ def sf_check_img_ext(images, allowed = [])
18
+ allowed ||= []
19
+ if images.is_a?(String)
20
+ images = images.split
21
+ elsif !images.is_a?(Array)
22
+ images = []
23
+ end
24
+ images.select { |i| i =~ sf_regex(:image, allowed) }
25
+ end
26
+
27
+ # Selects regexes for URIs, protocols and image extensions.
28
+ #
29
+ # Example:
30
+ # >> sf_regex(:uri)
31
+ # => /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
32
+ # >> sf_regex(:image, :jpg)
33
+ # => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg)(\?.+)?$)/i
34
+ # Arguments:
35
+ # type: (Symbol or String)
36
+ # - Regex type.
37
+ # args: (*)
38
+ # - Anything.
39
+
40
+ def sf_regex(type, *args)
41
+ type = type.to_sym unless type.is_a? Symbol
42
+ if type == :image
43
+ sf_img_regex args.flatten
44
+ else
45
+ regexes = {
46
+ uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
47
+ protocol: /((ht|f)tp[s]?)/i
48
+ }
49
+ regexes[type]
50
+ end
51
+ end
52
+
53
+ # Builds image regexes according to the required extensions.
54
+ #
55
+ # Example:
56
+ # >> sf_img_regex
57
+ # => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i
58
+ # >> sf_img_regex([:jpg, :png])
59
+ # => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|png)(\?.+)?$)/i
60
+ # Arguments:
61
+ # exts: (Array)
62
+ # - Image extensions which will be included in the regex.
63
+
64
+ def sf_img_regex(exts = [])
65
+ exts = [exts].flatten unless exts.is_a?(Array)
66
+ if exts.nil? or exts.empty?
67
+ exts = %w(jpg jpeg png gif)
68
+ elsif exts.include?(:jpg) and !exts.include?(:jpeg)
69
+ exts.push :jpeg
70
+ end
71
+ eval "/(^http{1}[s]?:\\/\\/([w]{3}\\.)?.+\\.(#{exts.join('|')})(\\?.+)?$)/i"
72
+ end
73
+
74
+ # Collection of paths used to get content from HTML tags via Node#xpath method.
75
+ # See more: http://nokogiri.org/tutorials/searching_a_xml_html_document.html
76
+ #
77
+ # Example:
78
+ # >> sf_paths[:title]
79
+ # => '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1'
80
+
81
+ def sf_paths
82
+ {
83
+ title: '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1',
84
+ description: '//meta[@property = "og:description"]/@content | //meta[@name = "description"]/@content | //meta[@name = "Description"]/@content | //h1 | //h3 | //p | //span | //font',
85
+ image: '//meta[@property = "og:image"]/@content | //link[@rel = "image_src"]/@href | //meta[@itemprop = "image"]/@content | //div[@id = "logo"]/img/@src | //a[@id = "logo"]/img/@src | //div[@class = "logo"]/img/@src | //a[@class = "logo"]/img/@src | //a//img[@width]/@src | //img[@width]/@src | //a//img[@height]/@src | //img[@height]/@src | //a//img/@src | //span//img/@src'
86
+ }
87
+ end
88
+
89
+ # Checks and returns only the valid image URIs.
90
+ #
91
+ # Example:
92
+ # >> sf_fix_imgs(['http://adtangerine.com/image.png', '/assets/image.jpg'], 'http://adtangerine.com', :jpg)
93
+ # => ['http://adtangerine/assets/image.jpg']
94
+ # Arguments:
95
+ # imgs: (Array)
96
+ # - Image URIs got from the HTML doc.
97
+ # uri: (String)
98
+ # - Used as basis to the URIs that don't have any protocol/domain set.
99
+ # exts: (Symbol or Array)
100
+ # - Allowed image extesntions.
101
+
102
+ def sf_fix_imgs(imgs, uri, exts = [])
103
+ sf_check_img_ext(imgs.map do |img|
104
+ img = img.to_s
105
+ img = sf_fix_protocol(img, sf_domain(uri)) unless img =~ sf_regex(:protocol)
106
+ img if (img =~ sf_regex(:image))
107
+ end.compact, exts)
108
+ end
109
+
110
+ # Fixes image URIs that doesn't present protocol/domain.
111
+ #
112
+ # Example:
113
+ # >> sf_fix_protocol('/assets/image.jpg', 'http://adtangerine.com')
114
+ # => 'http://adtangerine/assets/image.jpg'
115
+ # >> sf_fix_protocol('//s.ytimg.com/yts/img/youtub_img.png', 'https://youtube.com')
116
+ # => 'https://s.ytimg.com/yts/img/youtub_img.png'
117
+ # Arguments:
118
+ # path: (String)
119
+ # - URI path having no protocol/domain set.
120
+ # domain: (String)
121
+ # - Domain that will be prepended into the path.
122
+
123
+ def sf_fix_protocol(path, domain)
124
+ if path =~ /^\/\/[^\/]+/
125
+ 'http:' << path
126
+ else
127
+ "http://#{domain}#{'/' unless path =~ /^\/[^\/]+/}#{path}"
128
+ end
129
+ end
130
+
131
+ # Returns the domain from an URI
132
+ #
133
+ # Example:
134
+ # >> sf_domain('http://adtangerine.com')
135
+ # => 'adtangerine.com'
136
+ # Arguments:
137
+ # uri: (String)
138
+ # - URI.
139
+
140
+ def sf_domain(uri)
141
+ (uri.split('/')[2] rescue '')
142
+ end
143
+ end
144
+ end
@@ -0,0 +1,3 @@
1
module Scrapifier
  # Gem version, referenced by the gemspec. Frozen so the constant's value
  # cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
data/lib/scrapifier.rb ADDED
@@ -0,0 +1,4 @@
1
# coding: utf-8
require 'scrapifier/methods'

# Entry point of the gem: mixes Scrapifier::Methods into the core String
# class, so every String instance responds to #scrapify.
String.send :include, Scrapifier::Methods
@@ -0,0 +1,26 @@
1
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
# Make lib/ requirable so scrapifier/version can be loaded below.
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'scrapifier/version'

Gem::Specification.new do |spec|
  spec.name = 'scrapifier'
  spec.version = Scrapifier::VERSION
  spec.authors = ['Tiago Guedes']
  spec.email = ['tiagopog@gmail.com']
  spec.description = 'A very simple way to extract meta information from URIs using the screen scraping technique.'
  spec.summary = 'Extends the Ruby String class with a screen scraping method.'
  spec.homepage = 'https://github.com/tiagopog/scrapifier'
  spec.license = 'MIT'

  # Package every file tracked by git ($/ is Ruby's record separator, "\n").
  spec.files = `git ls-files`.split($/)
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  # Nokogiri parses the scraped HTML documents.
  spec.add_runtime_dependency 'nokogiri', '~> 1.6'

  spec.add_development_dependency 'bundler', '~> 1.3'
  spec.add_development_dependency 'rspec', '~> 2.14'
  spec.add_development_dependency 'rake', '~> 10.1'
end
@@ -0,0 +1,40 @@
1
# Fixture data shared by the specs.
module Factories
  private

  # Returns a Hash of samples:
  # - :misc    => one sample URI per scheme (http, https, ftp, bare www)
  # - :images  => sample image URIs grouped by extension
  # - :regexes => the exact regexes the gem is expected to build; the specs
  #               compare these with `eq`, so they must match byte-for-byte.
  def sf_samples
    {
      misc: {
        http: 'http://adtangerine.com',
        https: 'https://rubygems.org/gems/string_awesome',
        ftp: 'ftp://ftpserver.com',
        www: 'www.twitflink.com'
      },
      images: {
        jpg: [
          'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
          'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
          'http://foobar.com.br/nice-image.jpg'
        ],
        png: [
          'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
          'https://foobar.br/awesome_image.png',
          'https://bar.foobar.br/foo/var/image.png?foo=bar',
        ],
        gif: [
          'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
          'http://foobar.com/ugly_image.gif',
          'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
        ]
      },
      regexes: {
        image: {
          all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
          jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
          png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
          gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
        },
        uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
        protocol: /((ht|f)tp[s]?)/i
      }
    }
  end
end
@@ -0,0 +1,269 @@
1
# coding: utf-8
require 'spec_helper'
include Factories

describe String do
  let(:images) { sf_samples[:images] }
  let(:misc) { sf_samples[:misc] }
  let(:regexes) { sf_samples[:regexes] }

  #
  # String#scrapify
  #

  describe '#scrapify' do
    context 'when no URI is matched in the String' do
      subject { 'String without any URI.'.scrapify }

      it { should eq({}) }
    end

    context 'when the website was not found' do
      subject { 'Check out this http://someweirduri.com.br'.scrapify }

      it { should eq({}) }
    end

    context 'when an image URI is matched' do
      let(:jpg) { images[:jpg][0] }
      let(:png) { images[:png][0] }
      let(:gif) { images[:gif][0] }

      it 'sets the same value for :title, :description and :uri keys' do
        "Say my name: #{jpg}".scrapify.should include(title: jpg, description: jpg, uri: jpg)
      end

      it 'allows all the standard image extensions by default (even GIFs)' do
        "Smile GIF! Oh, wait... #{gif}".scrapify.should include(title: gif, description: gif, uri: gif)
      end

      it 'returns an empty Hash if the extension is not allowed' do
        "PNG is awesome! #{png}".scrapify(images: [:jpg]).should eq({})
      end
    end

    context 'when a website URI is matched in the String and a Hash is returned' do
      subject(:hash) { "Look this awesome site #{misc[:http]}".scrapify }

      it "includes a field with the site's title" do
        hash[:title].is_a?(String).should be_true
        hash[:title].empty?.should be_false
      end

      it "includes a field with the site's description" do
        hash[:description].is_a?(String).should be_true
        hash[:description].empty?.should be_false
      end

      it 'includes a field with the page URI' do
        hash[:uri].is_a?(String).should be_true
        hash[:uri].empty?.should be_false
        hash[:uri].should eq(misc[:http])
      end

      it "includes a field with image URIs from the site's head/body" do
        hash[:images].is_a?(Array).should be_true
        hash[:images].sample.should match(regexes[:image][:all])
      end
    end

    it "includes a field with only the allowed types of image URIs from the site's head/body" do
      misc[:http].scrapify(images: :png)[:images].sample.should match(regexes[:image][:png])
    end

    it "can choose the URI in the String to be scrapified" do
      hash = "Check out these awesome sites: #{misc[:http]} and #{misc[:www]}".scrapify(which: 1, images: :png)
      [:title, :description, :uri].each do |key|
        hash[key].is_a?(String).should be_true
        hash[key].empty?.should be_false
      end
      hash[:uri].should eq("http://#{misc[:www]}")
      hash[:images].sample.should match(regexes[:image][:png])
    end
  end

  #
  # String#find_uri
  #

  describe '#find_uri' do
    let(:sample_uris) { misc.map { |u| u[1] } }
    let(:str) { "Awesome sites: #{sample_uris.join ' and '}" }

    it 'matches the first URI in the String by default' do
      str.send(:find_uri).should eq(sample_uris[0])
    end

    it 'matches the second URI in the String (https)' do
      str.send(:find_uri, 1).should eq(sample_uris[1])
    end

    # NOTE: index 2 of the misc fixture is the ftp sample, not www; the old
    # description was wrong.
    it 'matches the third URI in the String (ftp)' do
      str.send(:find_uri, 2).should eq(sample_uris[2])
    end

    context 'when no URI is matched' do
      it 'returns nil' do
        'Lorem ipsum dolor.'.send(:find_uri).should be_nil
      end

      it 'returns nil (no presence of http|https|ftp|www)' do
        'Check this out: google.com'.send(:find_uri).should be_nil
      end
    end
  end

  #
  # String#sf_check_img_ext
  #

  describe '#sf_check_img_ext' do
    let(:img) { images[:jpg].sample }
    let(:imgs) { images.map { |i| i[1] }.flatten }
    let(:checked) do
      {
        str: ''.send(:sf_check_img_ext, img),
        array: ''.send(:sf_check_img_ext, imgs),
        jpg: ''.send(:sf_check_img_ext, imgs, [:jpg]),
        png: ''.send(:sf_check_img_ext, imgs, :png),
        gif: ''.send(:sf_check_img_ext, imgs, 'gif')
      }
    end

    context 'when no argument is passed' do
      it { expect { ''.send(:sf_check_img_ext) }.to raise_error(ArgumentError) }
    end

    context 'when only the first argument is defined' do
      it 'allows a String as argument' do
        checked[:str].should have(1).item
      end

      it 'allows an Array as argument' do
        checked[:jpg].should have(3).item
      end

      it 'allows all the image extensions by default' do
        checked[:array].should have(9).item
      end
    end

    context 'when the two arguments are defined' do
      it 'allows a Symbol as the second argument' do
        checked[:png].should have(3).item
      end

      it 'allows a String as the second argument' do
        checked[:gif].should have(3).item
      end

      it 'allows an Array as the second argument' do
        checked[:jpg].should have(3).item
      end

      it 'returns an Array with only image types allowed' do
        [:jpg, :png, :gif].each { |ext| checked[ext].should have(3).item }
      end
    end

    context 'when no image is found/allowed' do
      # Previously an empty example body; now actually asserts the contract.
      it 'returns an empty Array' do
        ''.send(:sf_check_img_ext, imgs, :svg).should eq([])
      end
    end

    it 'always returns an Array' do
      checked.each { |c| c[1].is_a?(Array).should be_true }
    end
  end

  #
  # String#sf_regex
  #

  describe '#sf_regex' do
    context 'when it needs a regex to match any kind of URI' do
      subject { ''.send(:sf_regex, :uri) }

      # Each scheme gets its own example; the old version always tested
      # misc[:http] and ignored the loop variable.
      [:http, :https, :ftp, :www].each do |p|
        it { should match(misc[p]) }
      end
    end

    context 'when it needs a regex to match only image uris' do
      subject { ''.send(:sf_regex, :image) }

      [:jpg, :png, :gif].each do |ext|
        it { should match(sf_samples[:images][ext].sample) }
      end
    end
  end

  #
  # String#sf_img_regex
  #

  describe '#sf_img_regex' do
    let(:img_regexes) { regexes[:image] }

    context 'when no argument is passed' do
      subject(:regex) { ''.send(:sf_img_regex) }

      it 'returns a regex that matches all image extensions' do
        regex.should eq(img_regexes[:all])
      end

      it 'matches all image extensions' do
        [:jpg, :png, :gif].each { |ext| images[ext].sample.should match(regex) }
      end
    end

    context 'when only jpg is allowed' do
      subject(:regex) { ''.send(:sf_img_regex, [:jpg]) }

      it 'returns a regex that matches only jpg images' do
        regex.should eq(img_regexes[:jpg])
      end

      it 'matches only the defined extension' do
        regex.should match(images[:jpg].sample)
      end

      it "doesn't match any other extension" do
        [:png, :gif].each { |ext| regex.should_not match(images[ext].sample) }
      end
    end

    context 'when only png is allowed' do
      subject(:regex) { ''.send(:sf_img_regex, :png) }

      it 'returns a regex that matches only png images' do
        regex.should eq(img_regexes[:png])
      end

      it 'matches only the defined extension' do
        regex.should match(images[:png].sample)
      end

      it "doesn't match any other extension" do
        [:jpg, :gif].each { |ext| regex.should_not match(images[ext].sample) }
      end
    end

    context 'when only gif (argh!) is allowed' do
      subject(:regex) { ''.send(:sf_img_regex, :gif) }

      it 'returns a regex that matches only gif images' do
        regex.should eq(img_regexes[:gif])
      end

      it 'matches only the defined extension' do
        regex.should match(images[:gif].sample)
      end

      it "doesn't match any other extension" do
        [:jpg, :png].each { |ext| regex.should_not match(images[ext].sample) }
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+ require 'scrapifier'
4
+ require 'factories/uris'
5
+
metadata ADDED
@@ -0,0 +1,117 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scrapifier
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Tiago Guedes
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-04-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '2.14'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '2.14'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10.1'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10.1'
69
+ description: A very simple way to extract meta information from URIs using the screen
70
+ scraping technique.
71
+ email:
72
+ - tiagopog@gmail.com
73
+ executables: []
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - ".gitignore"
78
+ - Gemfile
79
+ - LICENSE.txt
80
+ - README.md
81
+ - Rakefile
82
+ - lib/scrapifier.rb
83
+ - lib/scrapifier/methods.rb
84
+ - lib/scrapifier/support.rb
85
+ - lib/scrapifier/version.rb
86
+ - scrapifier.gemspec
87
+ - spec/factories/uris.rb
88
+ - spec/scrapifier_spec.rb
89
+ - spec/spec_helper.rb
90
+ homepage: https://github.com/tiagopog/scrapifier
91
+ licenses:
92
+ - MIT
93
+ metadata: {}
94
+ post_install_message:
95
+ rdoc_options: []
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ required_rubygems_version: !ruby/object:Gem::Requirement
104
+ requirements:
105
+ - - ">="
106
+ - !ruby/object:Gem::Version
107
+ version: '0'
108
+ requirements: []
109
+ rubyforge_project:
110
+ rubygems_version: 2.2.2
111
+ signing_key:
112
+ specification_version: 4
113
+ summary: Extends the Ruby String class with a screen scraping method.
114
+ test_files:
115
+ - spec/factories/uris.rb
116
+ - spec/scrapifier_spec.rb
117
+ - spec/spec_helper.rb