scrapifier 0.0.3 → 0.0.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 1bf5a65bfe5f1fd54830bf6b2ef57b286661c515
-  data.tar.gz: af7058d4dc2ef44d03c930b869003c12616b1edd
+  metadata.gz: 67a72b01b9c64d37f15c459e11c536f29bd0abc2
+  data.tar.gz: d83f2f8c7ff3203ee2a5472e2e2099a4389af7aa
 SHA512:
-  metadata.gz: c0d9fbe9986f730e57dff1b2164508581e928f1b2a7ccd7b48e0e812fa5002befbf953c30c2385498a21a196ebc3c09cff57106b7d3188956a820ed9bdecc26f
-  data.tar.gz: 1c5f1d0171f91f68acd9d642491adb9de3f1b210087c75117582da68d2cb22457812ae1ee747145bf300652aa175a8c65f9092c5f39ca9fcce73df1fa50de8a6
+  metadata.gz: 2a4abdb171a16b46b89bfb4c43a3262169c08c70125b3ac88e9c4f13209acd5085368b3a4b9399691b9fb9dc89fc4cacfe34b781a26df3c5646bfd4f20f44435
+  data.tar.gz: c246dd2f9770e3835f4423b497b0bb86d38e51ff9030b59aa516be89f11262db159d2938f94e43f60b91fb14c2c56015965fb8556eec963a13e3c568a4c68d9d
data/README.md CHANGED
@@ -23,9 +23,15 @@ Or install it yourself as:
 
     $ gem install scrapifier
 
+And then require the gem:
+
+    $ require 'scrapifier'
+
 ## Usage
 
-The method finds an URI in the String and gets some meta information from it, like the page's title, description, images and the URI. All the data is returned in a well-formatted Hash.
+The String#scrapify method finds URIs in a string and then gets their metadata, e.g., the page's title, description, images and URI. All the data is returned in a well-formatted hash.
+
+Note: This gem is mainly focused on screen scraping URLs that include a protocol (such as "http", "https" and "ftp"), but it also works with URIs that only have "www" and no protocol defined, like "www.google.com".
 
 #### Default usage.
 
@@ -42,7 +48,7 @@ The method finds an URI in the String and gets some meta information from it, li
 #### Allow only certain image types.
 
 ``` ruby
-'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: :jpg
+'Wow! What an awesome site: http://adtangerine.com!'.scrapify(images: :jpg)
 #=> {
 #     title: "AdTangerine | Advertising Platform for Social Media",
 #     description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
@@ -50,7 +56,7 @@ The method finds an URI in the String and gets some meta information from it, li
 #     uri: "http://adtangerine.com"
 #   }
 
-'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: [:png, :gif]
+'Wow! What an awesome site: http://adtangerine.com!'.scrapify(images: [:png, :gif])
 #=> {
 #     title: "AdTangerine | Advertising Platform for Social Media",
 #     description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
@@ -62,7 +68,7 @@ The method finds an URI in the String and gets some meta information from it, li
 #### Choose which URI you want it to be scraped.
 
 ``` ruby
-'Check out: http://adtangerine.com and www.twitflink.com'.scrapify which: 1
+'Check out: http://adtangerine.com and www.twitflink.com'.scrapify(which: 1)
 #=> {
 #     title: "TwitFlink | Find a link!",
 #     description: "TwitFlink is a very simple searching tool that allows people to find out links tweeted by any user from Twitter.",
@@ -70,7 +76,7 @@ The method finds an URI in the String and gets some meta information from it, li
 #     uri: "http://www.twitflink.com"
 #   }
 
-'Check out: http://adtangerine.com and www.twitflink.com'.scrapify({ which: 0, images: :gif })
+'Check out: http://adtangerine.com and www.twitflink.com'.scrapify(which: 0, images: :gif)
 #=> {
 #     title: "AdTangerine | Advertising Platform for Social Media",
 #     description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
@@ -1,4 +1,5 @@
 module Scrapifier
+  # Support methods to get, check and organize data.
   module Support
     module_function
 
@@ -18,14 +19,17 @@ module Scrapifier
     # Arguments:
     #   uri: (String)
     #     - URI.
-    #   imgs: (Array)
+    #   exts: (Array)
     #     - Allowed type of images.
-    def sf_eval_uri(uri, imgs = [])
+    def sf_eval_uri(uri, exts = [])
       doc = Nokogiri::HTML(open(uri).read)
       doc.encoding, meta = 'utf-8', { uri: uri }
 
-      [:title, :description].each { |k| meta[k] = (doc.xpath(sf_paths[k])[0].text rescue '-') }
-      meta[:images] = sf_fix_imgs(doc.xpath(sf_paths[:image]), uri, imgs)
+      [:title, :description].each do |k|
+        node = doc.xpath(sf_xpaths[k])[0]
+        meta[k] = node.nil? ? '-' : node.text
+      end
+      meta[:images] = sf_fix_imgs(doc.xpath(sf_xpaths[:image]), uri, exts)
 
       meta
     rescue SocketError
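The sf_eval_uri change above replaces an inline `rescue '-'` with an explicit nil check on the XPath lookup. A standalone sketch of that pattern follows; the helper name is made up for illustration and is not part of the gem.

``` ruby
require 'nokogiri'

# Hypothetical helper mirroring the nil-check pattern used in sf_eval_uri above:
# return the first node's text, or '-' when the XPath matches nothing.
def text_or_dash(doc, xpath)
  node = doc.xpath(xpath)[0]
  node.nil? ? '-' : node.text
end

doc = Nokogiri::HTML('<html><head><title>Hello</title></head><body></body></html>')
text_or_dash(doc, '//title')               #=> "Hello"
text_or_dash(doc, '//meta[@name="none"]')  #=> "-"
```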
@@ -33,11 +37,14 @@ module Scrapifier
     end
 
     # Filter images returning those with the allowed extentions.
-    # 
+    #
     # Example:
     # >> sf_check_img_ext('http://source.com/image.gif', :jpg)
     # => []
-    # >> sf_check_img_ext(['http://source.com/image.gif', 'http://source.com/image.jpg'], [:jpg, :png])
+    # >> sf_check_img_ext(
+    #      ['http://source.com/image.gif','http://source.com/image.jpg'],
+    #      [:jpg, :png]
+    #    )
     # => ['http://source.com/image.jpg']
     # Arguments:
     #   images: (String or Array)
@@ -55,32 +62,37 @@ module Scrapifier
     end
 
     # Select regexes for URIs, protocols and image extensions.
-    # 
+    #
     # Example:
     # >> sf_regex(:uri)
-    # => /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
+    # => /\b((((ht|f)tp[s]?:\/\/).../i,
     # >> sf_regex(:image, :jpg)
     # => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg)(\?.+)?$)/i
     # Arguments:
     #   type: (Symbol or String)
-    #     - Regex type.
+    #     - Regex type: :uri, :protocol, :image
     #   args: (*)
     #     - Anything.
     def sf_regex(type, *args)
       type = type.to_sym unless type.is_a? Symbol
-      if type == :image
-        sf_img_regex args.flatten
-      else
-        regexes = {
-          uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
-          protocol: /((ht|f)tp[s]?)/i
-        }
-        regexes[type]
-      end
+      type == :image && sf_img_regex(args.flatten) || sf_uri_regex[type]
+    end
+
+    # Build a hash with the URI regexes.
+    def sf_uri_regex
+      { uri: %r{\b(
+          (((ht|f)tp[s]?://)|([a-z0-9]+\.))+
+          (?<!@)
+          ([a-z0-9\_\-]+)
+          (\.[a-z]+)+
+          ([\?/\:][a-z0-9_=%&@\?\./\-\:\#\(\)]+)?
+          /?
+        )}ix,
+        protocol: /((ht|f)tp[s]?)/i }
     end
 
     # Build image regexes according to the required extensions.
-    # 
+    #
     # Example:
     # >> sf_img_regex
     # => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i
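Two things are worth noting about the sf_regex/sf_uri_regex rewrite above, sketched below using nothing beyond plain Ruby: the `x` modifier lets the long URI pattern be split across lines without changing what it matches, and the `cond && a || b` form only behaves like a ternary because sf_img_regex always returns a truthy Regexp.

``` ruby
# An /x (extended) regex ignores literal whitespace and newlines, so a multi-line
# literal can match exactly the same strings as a compact one-liner.
compact  = /((ht|f)tp[s]?)/i
readable = %r{(
  (ht|f)tp[s]?
)}ix

%w[http https ftp file].map { |s| !!(s =~ compact) == !!(s =~ readable) }
#=> [true, true, true, true]
```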
@@ -91,54 +103,90 @@ module Scrapifier
    #     - Image extensions which will be included in the regex.
     def sf_img_regex(exts = [])
       exts = [exts].flatten unless exts.is_a?(Array)
-      if exts.nil? or exts.empty?
+      if exts.nil? || exts.empty?
         exts = %w(jpg jpeg png gif)
-      elsif exts.include?(:jpg) and !exts.include?(:jpeg)
+      elsif exts.include?(:jpg) && !exts.include?(:jpeg)
         exts.push :jpeg
-      end
-      eval "/(^http{1}[s]?:\\/\\/([w]{3}\\.)?.+\\.(#{exts.join('|')})(\\?.+)?$)/i"
+      end
+      %r{(^http{1}[s]?://([w]{3}\.)?.+\.(#{exts.join('|')})(\?.+)?$)}i
     end
 
-    # Collection of paths used to get content from HTML tags via Node#xpath method.
-    # See more: http://nokogiri.org/tutorials/searching_a_xml_html_document.html
-    #
-    # Example:
-    # >> sf_paths[:title]
-    # => '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1'
-    def sf_paths
+    # Collection of xpath that are used to get nodes
+    # from the parsed HTML.
+    def sf_xpaths
       {
-        title: '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1',
-        description: '//meta[@property = "og:description"]/@content | //meta[@name = "description"]/@content | //meta[@name = "Description"]/@content | //h1 | //h3 | //p | //span | //font',
-        image: '//meta[@property = "og:image"]/@content | //link[@rel = "image_src"]/@href | //meta[@itemprop = "image"]/@content | //div[@id = "logo"]/img/@src | //a[@id = "logo"]/img/@src | //div[@class = "logo"]/img/@src | //a[@class = "logo"]/img/@src | //a//img[@width]/@src | //img[@width]/@src | //a//img[@height]/@src | //img[@height]/@src | //a//img/@src | //span//img/@src'
+        title: sf_title_xpath,
+        description: sf_desc_xpath,
+        image: sf_img_xpath
       }
     end
 
+    def sf_title_xpath
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@property = "og:title"]/@content|
+        |//meta[@name = "title"]/@content|
+        |//meta[@name = "Title"]/@content|
+        |//title|//h1
+      END
+    end
+
+    def sf_desc_xpath
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@property = "og:description"]/@content|
+        |//meta[@name = "description"]/@content|
+        |//meta[@name = "Description"]/@content|
+        |//h1|//h3|//p|//span|//font
+      END
+    end
+
+    def sf_img_xpath
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@property = "og:image"]/@content|
+        |//link[@rel = "image_src"]/@href|
+        |//meta[@itemprop = "image"]/@content|
+        |//div[@id = "logo"]/img/@src|//a[@id = "logo"]/img/@src|
+        |//div[@class = "logo"]/img/@src|//a[@class = "logo"]/img/@src|
+        |//a//img[@width]/@src|//img[@width]/@src|
+        |//a//img[@height]/@src|//img[@height]/@src|
+        |//a//img/@src|//span//img/@src|//img/@src
+      END
+    end
+
     # Check and return only the valid image URIs.
-    # 
+    #
     # Example:
-    # >> sf_fix_imgs(['http://adtangerine.com/image.png', '/assets/image.jpg'], 'http://adtangerine.com', :jpg)
+    # >> sf_fix_imgs(
+    #      ['http://adtangerine.com/image.png', '/assets/image.jpg'],
+    #      'http://adtangerine.com',
+    #      :jpg
+    #    )
     # => ['http://adtangerine/assets/image.jpg']
     # Arguments:
     #   imgs: (Array)
     #     - Image URIs got from the HTML doc.
     #   uri: (String)
-    #     - Used as basis to the URIs that don't have any protocol/domain set. 
+    #     - Used as basis to the URIs that don't have any protocol/domain set.
     #   exts: (Symbol or Array)
     #     - Allowed image extesntions.
     def sf_fix_imgs(imgs, uri, exts = [])
       sf_check_img_ext(imgs.map do |img|
-        img = img.to_s
-        img = sf_fix_protocol(img, sf_domain(uri)) unless img =~ sf_regex(:protocol)
-        img if (img =~ sf_regex(:image))
+        img = img.to_s
+        unless img =~ sf_regex(:protocol)
+          img = sf_fix_protocol(img, sf_domain(uri))
+        end
+        img if img =~ sf_regex(:image)
       end.compact, exts)
     end
 
     # Fix image URIs that don't have a protocol/domain set.
-    # 
+    #
     # Example:
     # >> sf_fix_protocol('/assets/image.jpg', 'http://adtangerine.com')
     # => 'http://adtangerine/assets/image.jpg'
-    # >> sf_fix_protocol('//s.ytimg.com/yts/img/youtub_img.png', 'https://youtube.com')
+    # >> sf_fix_protocol(
+    #      '//s.ytimg.com/yts/img/youtub_img.png',
+    #      'https://youtube.com'
+    #    )
     # => 'https://s.ytimg.com/yts/img/youtub_img.png'
     # Arguments:
     #   path: (String)
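The sf_*_xpath helpers above assemble one long XPath union from a heredoc, and the `gsub(/^\s+\|/, '')` call strips each line's indentation plus its leading pipe. Below is a tiny standalone illustration of that string transformation; the selectors are shortened examples, not the gem's full list.

``` ruby
# Each heredoc line starts with indentation and a '|'; the gsub removes both,
# leaving a newline-separated XPath union.
xpath = <<-END.gsub(/^\s+\|/, '')
  |//meta[@name = "title"]/@content|
  |//title|//h1
END

puts xpath
# //meta[@name = "title"]/@content|
# //title|//h1
```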
@@ -146,15 +194,15 @@ module Scrapifier
    #   domain: (String)
    #     - Domain that will be prepended into the path.
     def sf_fix_protocol(path, domain)
-      if path =~ /^\/\/[^\/]+/
+      if path =~ %r{^//[^/]+}
         'http:' << path
       else
-        "http://#{domain}#{'/' unless path =~ /^\/[^\/]+/}#{path}"
-      end
+        "http://#{domain}#{'/' unless path =~ %r{^/[^/]+}}#{path}"
+      end
     end
 
     # Return the URI domain.
-    # 
+    #
     # Example:
     # >> sf_domain('http://adtangerine.com')
     # => 'adtangerine.com'
@@ -162,7 +210,8 @@ module Scrapifier
    #   uri: (String)
    #     - URI.
     def sf_domain(uri)
-      (uri.split('/')[2] rescue '')
+      uri = uri.to_s.split('/')
+      uri.empty? ? '' : uri[2]
     end
   end
-end
+end
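For reference, the protocol/domain fix-up logic shown above can be exercised in isolation. The sketch below copies its behaviour into hypothetical stand-alone methods; these are simplified illustrations, not the gem's public API.

``` ruby
# Simplified, hypothetical copies of sf_domain and sf_fix_protocol, for illustration.
def domain_of(uri)
  parts = uri.to_s.split('/')
  parts.empty? ? '' : parts[2]
end

def absolutize(path, domain)
  if path =~ %r{^//[^/]+}   # protocol-relative, e.g. //host/img.png
    'http:' << path
  else
    "http://#{domain}#{'/' unless path =~ %r{^/[^/]+}}#{path}"
  end
end

domain_of('http://adtangerine.com/some/page')       #=> "adtangerine.com"
absolutize('/assets/image.jpg', 'adtangerine.com')  #=> "http://adtangerine.com/assets/image.jpg"
absolutize('//s.ytimg.com/img.png', 'youtube.com')  #=> "http://s.ytimg.com/img.png"
```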
@@ -1,3 +1,3 @@
 module Scrapifier
-  VERSION = '0.0.3'
+  VERSION = '0.0.4'
 end
@@ -1,40 +1,41 @@
 module Factories
   private
-  def sf_samples
-    {
-      misc: {
-        http: 'http://adtangerine.com',
-        https: 'https://rubygems.org/gems/string_awesome',
-        ftp: 'ftp://ftpserver.com',
-        www: 'www.twitflink.com'
+
+  def sf_samples
+    {
+      misc: {
+        http: 'http://adtangerine.com',
+        https: 'https://rubygems.org/gems/string_awesome',
+        ftp: 'ftp://ftpserver.com',
+        www: 'www.twitflink.com'
+      },
+      images: {
+        jpg: [
+          'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
+          'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
+          'http://foobar.com.br/nice-image.jpg'
+        ],
+        png: [
+          'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
+          'https://foobar.br/awesome_image.png',
+          'https://bar.foobar.br/foo/var/image.png?foo=bar',
+        ],
+        gif: [
+          'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
+          'http://foobar.com/ugly_image.gif',
+          'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
+        ]
+      },
+      regexes: {
+        image: {
+          all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
+          jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
+          png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
+          gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
         },
-      images: {
-        jpg: [
-          'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
-          'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
-          'http://foobar.com.br/nice-image.jpg'
-        ],
-        png: [
-          'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
-          'https://foobar.br/awesome_image.png',
-          'https://bar.foobar.br/foo/var/image.png?foo=bar',
-        ],
-        gif: [
-          'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
-          'http://foobar.com/ugly_image.gif',
-          'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
-        ]
-      },
-      regexes: {
-        image: {
-          all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
-          jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
-          png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
-          gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
-        },
-        uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
-        protocol: /((ht|f)tp[s]?)/i
-      }
+        uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
+        protocol: /((ht|f)tp[s]?)/i
       }
-  end
-end
+    }
+  end
+end
@@ -62,13 +62,16 @@ describe String do
     end
 
     it "includes a field with image URIs from the site's head/body" do
-      hash[:images].is_a?(Array).should be_true
-      hash[:images].sample.should match(regexes[:image][:all])
+      unless hash[:images].empty?
+        hash[:images].is_a?(Array).should be_true
+        hash[:images].sample.should match(regexes[:image][:all])
+      end
     end
   end
 
   it "includes a field with only the allowed types of image URIs from the site's head/body" do
-    misc[:http].scrapify(images: :png)[:images].sample.should match(regexes[:image][:png])
+    image = misc[:http].scrapify(images: :png)[:images].sample
+    image.should match(regexes[:image][:png]) unless image.nil?
   end
 
   it "can choose the URI in the String to be scrapified"
@@ -2,4 +2,3 @@ require 'rubygems'
 require 'bundler/setup'
 require 'scrapifier'
 require 'factories/samples'
-
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scrapifier
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 platform: ruby
 authors:
 - Tiago Guedes
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-30 00:00:00.000000000 Z
+date: 2014-06-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri