scrapifier 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 1bf5a65bfe5f1fd54830bf6b2ef57b286661c515
- data.tar.gz: af7058d4dc2ef44d03c930b869003c12616b1edd
+ metadata.gz: 67a72b01b9c64d37f15c459e11c536f29bd0abc2
+ data.tar.gz: d83f2f8c7ff3203ee2a5472e2e2099a4389af7aa
  SHA512:
- metadata.gz: c0d9fbe9986f730e57dff1b2164508581e928f1b2a7ccd7b48e0e812fa5002befbf953c30c2385498a21a196ebc3c09cff57106b7d3188956a820ed9bdecc26f
- data.tar.gz: 1c5f1d0171f91f68acd9d642491adb9de3f1b210087c75117582da68d2cb22457812ae1ee747145bf300652aa175a8c65f9092c5f39ca9fcce73df1fa50de8a6
+ metadata.gz: 2a4abdb171a16b46b89bfb4c43a3262169c08c70125b3ac88e9c4f13209acd5085368b3a4b9399691b9fb9dc89fc4cacfe34b781a26df3c5646bfd4f20f44435
+ data.tar.gz: c246dd2f9770e3835f4423b497b0bb86d38e51ff9030b59aa516be89f11262db159d2938f94e43f60b91fb14c2c56015965fb8556eec963a13e3c568a4c68d9d
data/README.md CHANGED
@@ -23,9 +23,15 @@ Or install it yourself as:

  $ gem install scrapifier

+ And then require the gem:
+
+ $ require 'scrapifier'
+
  ## Usage

- The method finds an URI in the String and gets some meta information from it, like the page's title, description, images and the URI. All the data is returned in a well-formatted Hash.
+ The String#scrapify method finds URIs in a string and then gets their metadata, e.g., the page's title, description, images and URI. All the data is returned in a well-formatted hash.
+
+ Note: This gem is mainly focused on screen scraping URLs (presence of protocol, such as: "http", "https" and "ftp"), but it also works with URIs which have the "www" without any protocol defined, like: "www.google.com".

  #### Default usage.

@@ -42,7 +48,7 @@ The method finds an URI in the String and gets some meta information from it, li
  #### Allow only certain image types.

  ``` ruby
- 'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: :jpg
+ 'Wow! What an awesome site: http://adtangerine.com!'.scrapify(images: :jpg)
  #=> {
  # title: "AdTangerine | Advertising Platform for Social Media",
  # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
@@ -50,7 +56,7 @@ The method finds an URI in the String and gets some meta information from it, li
  # uri: "http://adtangerine.com"
  # }

- 'Wow! What an awesome site: http://adtangerine.com!'.scrapify images: [:png, :gif]
+ 'Wow! What an awesome site: http://adtangerine.com!'.scrapify(images: [:png, :gif])
  #=> {
  # title: "AdTangerine | Advertising Platform for Social Media",
  # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
@@ -62,7 +68,7 @@ The method finds an URI in the String and gets some meta information from it, li
  #### Choose which URI you want it to be scraped.

  ``` ruby
- 'Check out: http://adtangerine.com and www.twitflink.com'.scrapify which: 1
+ 'Check out: http://adtangerine.com and www.twitflink.com'.scrapify(which: 1)
  #=> {
  # title: "TwitFlink | Find a link!",
  # description: "TwitFlink is a very simple searching tool that allows people to find out links tweeted by any user from Twitter.",
@@ -70,7 +76,7 @@ The method finds an URI in the String and gets some meta information from it, li
  # uri: "http://www.twitflink.com"
  # }

- 'Check out: http://adtangerine.com and www.twitflink.com'.scrapify({ which: 0, images: :gif })
+ 'Check out: http://adtangerine.com and www.twitflink.com'.scrapify(which: 0, images: :gif)
  #=> {
  # title: "AdTangerine | Advertising Platform for Social Media",
  # description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
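
Taken together, the README additions above cover the gem's whole public surface: require `scrapifier`, then call `String#scrapify` with the optional `images:` and `which:` arguments. The snippet below is a minimal sketch that stitches those excerpts into one runnable example; the titles, descriptions and URIs are copied from the README examples above, while the `images` arrays are left as placeholders because their contents depend on the scraped page, and the combined `which:`/`images:` call mirrors the `which: 0, images: :gif` example.

``` ruby
require 'scrapifier'

# Default usage: no options, so every image type (jpg/jpeg/png/gif) is allowed.
'Wow! What an awesome site: http://adtangerine.com!'.scrapify
#=> {
#     title: "AdTangerine | Advertising Platform for Social Media",
#     description: "AdTangerine is an advertising platform that uses the tangerine as a virtual currency for advertisers and publishers in order to share content on social networks.",
#     images: [...], # image URIs found in the page's head/body
#     uri: "http://adtangerine.com"
#   }

# Protocol-less "www" URIs are recognized too (see the Note above);
# which: 1 scrapes the second URI found in the string, and the options
# can be combined.
'Check out: http://adtangerine.com and www.twitflink.com'.scrapify(which: 1, images: :png)
#=> {
#     title: "TwitFlink | Find a link!",
#     description: "TwitFlink is a very simple searching tool that allows people to find out links tweeted by any user from Twitter.",
#     images: [...], # only .png URIs in this case
#     uri: "http://www.twitflink.com"
#   }
```
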
@@ -1,4 +1,5 @@
  module Scrapifier
+ # Support methods to get, check and organize data.
  module Support
  module_function

@@ -18,14 +19,17 @@ module Scrapifier
  # Arguments:
  # uri: (String)
  # - URI.
- # imgs: (Array)
+ # exts: (Array)
  # - Allowed type of images.
- def sf_eval_uri(uri, imgs = [])
+ def sf_eval_uri(uri, exts = [])
  doc = Nokogiri::HTML(open(uri).read)
  doc.encoding, meta = 'utf-8', { uri: uri }

- [:title, :description].each { |k| meta[k] = (doc.xpath(sf_paths[k])[0].text rescue '-') }
- meta[:images] = sf_fix_imgs(doc.xpath(sf_paths[:image]), uri, imgs)
+ [:title, :description].each do |k|
+ node = doc.xpath(sf_xpaths[k])[0]
+ meta[k] = node.nil? ? '-' : node.text
+ end
+ meta[:images] = sf_fix_imgs(doc.xpath(sf_xpaths[:image]), uri, exts)

  meta
  rescue SocketError
@@ -33,11 +37,14 @@ module Scrapifier
  end

  # Filter images returning those with the allowed extentions.
- #
+ #
  # Example:
  # >> sf_check_img_ext('http://source.com/image.gif', :jpg)
  # => []
- # >> sf_check_img_ext(['http://source.com/image.gif', 'http://source.com/image.jpg'], [:jpg, :png])
+ # >> sf_check_img_ext(
+ # ['http://source.com/image.gif','http://source.com/image.jpg'],
+ # [:jpg, :png]
+ # )
  # => ['http://source.com/image.jpg']
  # Arguments:
  # images: (String or Array)
@@ -55,32 +62,37 @@ module Scrapifier
  end

  # Select regexes for URIs, protocols and image extensions.
- #
+ #
  # Example:
  # >> sf_regex(:uri)
- # => /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
+ # => /\b((((ht|f)tp[s]?:\/\/).../i,
  # >> sf_regex(:image, :jpg)
  # => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg)(\?.+)?$)/i
  # Arguments:
  # type: (Symbol or String)
- # - Regex type.
+ # - Regex type: :uri, :protocol, :image
  # args: (*)
  # - Anything.
  def sf_regex(type, *args)
  type = type.to_sym unless type.is_a? Symbol
- if type == :image
- sf_img_regex args.flatten
- else
- regexes = {
- uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
- protocol: /((ht|f)tp[s]?)/i
- }
- regexes[type]
- end
+ type == :image && sf_img_regex(args.flatten) || sf_uri_regex[type]
+ end
+
+ # Build a hash with the URI regexes.
+ def sf_uri_regex
+ { uri: %r{\b(
+ (((ht|f)tp[s]?://)|([a-z0-9]+\.))+
+ (?<!@)
+ ([a-z0-9\_\-]+)
+ (\.[a-z]+)+
+ ([\?/\:][a-z0-9_=%&@\?\./\-\:\#\(\)]+)?
+ /?
+ )}ix,
+ protocol: /((ht|f)tp[s]?)/i }
  end

  # Build image regexes according to the required extensions.
- #
+ #
  # Example:
  # >> sf_img_regex
  # => /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i
@@ -91,54 +103,90 @@ module Scrapifier
  # - Image extensions which will be included in the regex.
  def sf_img_regex(exts = [])
  exts = [exts].flatten unless exts.is_a?(Array)
- if exts.nil? or exts.empty?
+ if exts.nil? || exts.empty?
  exts = %w(jpg jpeg png gif)
- elsif exts.include?(:jpg) and !exts.include?(:jpeg)
+ elsif exts.include?(:jpg) && !exts.include?(:jpeg)
  exts.push :jpeg
- end
- eval "/(^http{1}[s]?:\\/\\/([w]{3}\\.)?.+\\.(#{exts.join('|')})(\\?.+)?$)/i"
+ end
+ %r{(^http{1}[s]?://([w]{3}\.)?.+\.(#{exts.join('|')})(\?.+)?$)}i
  end

- # Collection of paths used to get content from HTML tags via Node#xpath method.
- # See more: http://nokogiri.org/tutorials/searching_a_xml_html_document.html
- #
- # Example:
- # >> sf_paths[:title]
- # => '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1'
- def sf_paths
+ # Collection of xpath that are used to get nodes
+ # from the parsed HTML.
+ def sf_xpaths
  {
- title: '//meta[@property = "og:title"]/@content | //meta[@name = "title"]/@content | //meta[@name = "Title"]/@content | //title | //h1',
- description: '//meta[@property = "og:description"]/@content | //meta[@name = "description"]/@content | //meta[@name = "Description"]/@content | //h1 | //h3 | //p | //span | //font',
- image: '//meta[@property = "og:image"]/@content | //link[@rel = "image_src"]/@href | //meta[@itemprop = "image"]/@content | //div[@id = "logo"]/img/@src | //a[@id = "logo"]/img/@src | //div[@class = "logo"]/img/@src | //a[@class = "logo"]/img/@src | //a//img[@width]/@src | //img[@width]/@src | //a//img[@height]/@src | //img[@height]/@src | //a//img/@src | //span//img/@src'
+ title: sf_title_xpath,
+ description: sf_desc_xpath,
+ image: sf_img_xpath
  }
  end

+ def sf_title_xpath
+ <<-END.gsub(/^\s+\|/, '')
+ |//meta[@property = "og:title"]/@content|
+ |//meta[@name = "title"]/@content|
+ |//meta[@name = "Title"]/@content|
+ |//title|//h1
+ END
+ end
+
+ def sf_desc_xpath
+ <<-END.gsub(/^\s+\|/, '')
+ |//meta[@property = "og:description"]/@content|
+ |//meta[@name = "description"]/@content|
+ |//meta[@name = "Description"]/@content|
+ |//h1|//h3|//p|//span|//font
+ END
+ end
+
+ def sf_img_xpath
+ <<-END.gsub(/^\s+\|/, '')
+ |//meta[@property = "og:image"]/@content|
+ |//link[@rel = "image_src"]/@href|
+ |//meta[@itemprop = "image"]/@content|
+ |//div[@id = "logo"]/img/@src|//a[@id = "logo"]/img/@src|
+ |//div[@class = "logo"]/img/@src|//a[@class = "logo"]/img/@src|
+ |//a//img[@width]/@src|//img[@width]/@src|
+ |//a//img[@height]/@src|//img[@height]/@src|
+ |//a//img/@src|//span//img/@src|//img/@src
+ END
+ end
+
  # Check and return only the valid image URIs.
- #
+ #
  # Example:
- # >> sf_fix_imgs(['http://adtangerine.com/image.png', '/assets/image.jpg'], 'http://adtangerine.com', :jpg)
+ # >> sf_fix_imgs(
+ # ['http://adtangerine.com/image.png', '/assets/image.jpg'],
+ # 'http://adtangerine.com',
+ # :jpg
+ # )
  # => ['http://adtangerine/assets/image.jpg']
  # Arguments:
  # imgs: (Array)
  # - Image URIs got from the HTML doc.
  # uri: (String)
- # - Used as basis to the URIs that don't have any protocol/domain set.
+ # - Used as basis to the URIs that don't have any protocol/domain set.
  # exts: (Symbol or Array)
  # - Allowed image extesntions.
  def sf_fix_imgs(imgs, uri, exts = [])
  sf_check_img_ext(imgs.map do |img|
- img = img.to_s
- img = sf_fix_protocol(img, sf_domain(uri)) unless img =~ sf_regex(:protocol)
- img if (img =~ sf_regex(:image))
+ img = img.to_s
+ unless img =~ sf_regex(:protocol)
+ img = sf_fix_protocol(img, sf_domain(uri))
+ end
+ img if img =~ sf_regex(:image)
  end.compact, exts)
  end

  # Fix image URIs that don't have a protocol/domain set.
- #
+ #
  # Example:
  # >> sf_fix_protocol('/assets/image.jpg', 'http://adtangerine.com')
  # => 'http://adtangerine/assets/image.jpg'
- # >> sf_fix_protocol('//s.ytimg.com/yts/img/youtub_img.png', 'https://youtube.com')
+ # >> sf_fix_protocol(
+ # '//s.ytimg.com/yts/img/youtub_img.png',
+ # 'https://youtube.com'
+ # )
  # => 'https://s.ytimg.com/yts/img/youtub_img.png'
  # Arguments:
  # path: (String)
@@ -146,15 +194,15 @@ module Scrapifier
  # domain: (String)
  # - Domain that will be prepended into the path.
  def sf_fix_protocol(path, domain)
- if path =~ /^\/\/[^\/]+/
+ if path =~ %r{^//[^/]+}
  'http:' << path
  else
- "http://#{domain}#{'/' unless path =~ /^\/[^\/]+/}#{path}"
- end
+ "http://#{domain}#{'/' unless path =~ %r{^/[^/]+}}#{path}"
+ end
  end

  # Return the URI domain.
- #
+ #
  # Example:
  # >> sf_domain('http://adtangerine.com')
  # => 'adtangerine.com'
@@ -162,7 +210,8 @@ module Scrapifier
  # uri: (String)
  # - URI.
  def sf_domain(uri)
- (uri.split('/')[2] rescue '')
+ uri = uri.to_s.split('/')
+ uri.empty? ? '' : uri[2]
  end
  end
- end
+ end
@@ -1,3 +1,3 @@
  module Scrapifier
- VERSION = '0.0.3'
+ VERSION = '0.0.4'
  end
@@ -1,40 +1,41 @@
  module Factories
  private
- def sf_samples
- {
- misc: {
- http: 'http://adtangerine.com',
- https: 'https://rubygems.org/gems/string_awesome',
- ftp: 'ftp://ftpserver.com',
- www: 'www.twitflink.com'
+
+ def sf_samples
+ {
+ misc: {
+ http: 'http://adtangerine.com',
+ https: 'https://rubygems.org/gems/string_awesome',
+ ftp: 'ftp://ftpserver.com',
+ www: 'www.twitflink.com'
+ },
+ images: {
+ jpg: [
+ 'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
+ 'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
+ 'http://foobar.com.br/nice-image.jpg'
+ ],
+ png: [
+ 'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
+ 'https://foobar.br/awesome_image.png',
+ 'https://bar.foobar.br/foo/var/image.png?foo=bar',
+ ],
+ gif: [
+ 'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
+ 'http://foobar.com/ugly_image.gif',
+ 'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
+ ]
+ },
+ regexes: {
+ image: {
+ all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
+ jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
+ png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
+ gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
  },
- images: {
- jpg: [
- 'http://jlcauvin.com/wp-content/uploads/2013/09/heisenberg-breaking-bad.jpg',
- 'https://www.foobar.com/awesome_image.jpeg?foo=bar&bar=foo',
- 'http://foobar.com.br/nice-image.jpg'
- ],
- png: [
- 'http://www.faniq.com/images/blog/58389e481aee9c5abbf49ff0a263f3ca.png',
- 'https://foobar.br/awesome_image.png',
- 'https://bar.foobar.br/foo/var/image.png?foo=bar',
- ],
- gif: [
- 'http://31.media.tumblr.com/6eec77e355fe50bae424291fd8c58622/tumblr_me7ucl8kO61rf089no1_500.gif',
- 'http://foobar.com/ugly_image.gif',
- 'https://bar.foobar.br/foo/var/stop_using.gif?foo=bar'
- ]
- },
- regexes: {
- image: {
- all: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg|png|gif)(\?.+)?$)/i,
- jpg: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(jpg|jpeg)(\?.+)?$)/i,
- png: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(png)(\?.+)?$)/i,
- gif: /(^http{1}[s]?:\/\/([w]{3}\.)?.+\.(gif)(\?.+)?$)/i
- },
- uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
- protocol: /((ht|f)tp[s]?)/i
- }
+ uri: /\b((((ht|f)tp[s]?:\/\/)|([a-z0-9]+\.))+(?<!@)([a-z0-9\_\-]+)(\.[a-z]+)+([\?\/\:][a-z0-9_=%&@\?\.\/\-\:\#\(\)]+)?\/?)/i,
+ protocol: /((ht|f)tp[s]?)/i
  }
- end
- end
+ }
+ end
+ end
@@ -62,13 +62,16 @@ describe String do
  end

  it "includes a field with image URIs from the site's head/body" do
- hash[:images].is_a?(Array).should be_true
- hash[:images].sample.should match(regexes[:image][:all])
+ unless hash[:images].empty?
+ hash[:images].is_a?(Array).should be_true
+ hash[:images].sample.should match(regexes[:image][:all])
+ end
  end
  end

  it "includes a field with only the allowed types of image URIs from the site's head/body" do
- misc[:http].scrapify(images: :png)[:images].sample.should match(regexes[:image][:png])
+ image = misc[:http].scrapify(images: :png)[:images].sample
+ image.should match(regexes[:image][:png]) unless image.nil?
  end

  it "can choose the URI in the String to be scrapified" do
@@ -2,4 +2,3 @@ require 'rubygems'
  require 'bundler/setup'
  require 'scrapifier'
  require 'factories/samples'
-
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: scrapifier
  version: !ruby/object:Gem::Version
- version: 0.0.3
+ version: 0.0.4
  platform: ruby
  authors:
  - Tiago Guedes
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-04-30 00:00:00.000000000 Z
+ date: 2014-06-06 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri