metainspector 1.16.1 → 1.17.0

spec/redirections_spec.rb CHANGED
@@ -9,7 +9,7 @@ describe MetaInspector do
        m = MetaInspector.new("http://facebook.com")
        m.title.should be_nil
        m.should_not be_ok
-       m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com/ -> https://www.facebook.com/"
+       m.exceptions.first.message.should == "redirection forbidden: http://facebook.com/ -> https://www.facebook.com/"
      end

      it "allows safe redirections when :allow_redirections => :safe" do
@@ -30,14 +30,14 @@ describe MetaInspector do
        m = MetaInspector.new("https://unsafe-facebook.com")
        m.title.should be_nil
        m.should_not be_ok
-       m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
+       m.exceptions.first.message.should == "redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
      end

      it "disallows unsafe redirections when :allow_redirections => :safe" do
        m = MetaInspector.new("https://unsafe-facebook.com", :allow_redirections => :safe)
        m.title.should be_nil
        m.should_not be_ok
-       m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
+       m.exceptions.first.message.should == "redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
      end

      it "allows unsafe redirections when :allow_redirections => :all" do
@@ -46,5 +46,22 @@ describe MetaInspector do
        m.should be_ok
      end
    end
+
+    describe "Redirections should update the base_uri" do
+      it "updates the base_uri on safe redirections" do
+        m = MetaInspector.new("http://facebook.com", :allow_redirections => :safe)
+        # Check for the title to make sure the request happens
+        m.title.should == "Hello From Facebook"
+        m.url.should == "https://www.facebook.com/"
+      end
+
+      it "updates the base_uri on all redirections" do
+        m = MetaInspector.new("http://facebook.com", :allow_redirections => :all)
+        # Check for the title to make sure the request happens
+        m.title.should == "Hello From Facebook"
+
+        m.url.should == "https://www.facebook.com/"
+      end
+    end
  end
end
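
The substantive change in this file: scraping failures are now surfaced as exception objects rather than prefixed strings. A minimal before/after sketch, using only names exercised by the specs above (the Facebook URLs come from the suite's FakeWeb stubs, not live requests):

    m = MetaInspector.new("http://facebook.com")
    m.ok?  # => false; the HTTP -> HTTPS redirection is forbidden by default

    # 1.16.1 collected formatted strings:
    #   m.errors.first  # => "Scraping exception: redirection forbidden: ..."

    # 1.17.0 keeps the exception objects, so callers can branch on class or message:
    m.exceptions.first.message  # => "redirection forbidden: http://facebook.com/ -> https://www.facebook.com/"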
spec/request_spec.rb ADDED
@@ -0,0 +1,64 @@
+# -*- encoding: utf-8 -*-
+
+require File.join(File.dirname(__FILE__), "/spec_helper")
+
+describe MetaInspector::Request do
+
+  describe "read" do
+    it "should return the content of the page" do
+      page_request = MetaInspector::Request.new(url('http://pagerankalert.com'))
+
+      page_request.read[0..14].should == "<!DOCTYPE html>"
+    end
+  end
+
+  describe "content_type" do
+    it "should return the correct content type of the url for html pages" do
+      page_request = MetaInspector::Request.new(url('http://pagerankalert.com'))
+
+      page_request.content_type.should == "text/html"
+    end
+
+    it "should return the correct content type of the url for non html pages" do
+      image_request = MetaInspector::Request.new(url('http://pagerankalert.com/image.png'))
+
+      image_request.content_type.should == "image/png"
+    end
+  end
+
+  describe 'exception handling' do
+    before(:each) do
+      FakeWeb.allow_net_connect = true
+    end
+
+    after(:each) do
+      FakeWeb.allow_net_connect = false
+    end
+
+    it "should handle timeouts" do
+      impatient = MetaInspector::Request.new(url('http://example.com'), timeout: 0.0000000000001)
+
+      expect {
+        impatient.read.should be_nil
+      }.to change { impatient.exceptions.size }
+
+      impatient.exceptions.first.class.should == Timeout::Error
+    end
+
+    it "should handle socket errors" do
+      nowhere = MetaInspector::Request.new(url('http://caca232dsdsaer3sdsd-asd343.org'))
+
+      expect {
+        nowhere.read.should be_nil
+      }.to change { nowhere.exceptions.size }
+
+      nowhere.exceptions.first.class.should == SocketError
+    end
+  end
+
+  private
+
+  def url(initial_url)
+    MetaInspector::URL.new(initial_url)
+  end
+end
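
Taken together, these specs pin down the contract of the new MetaInspector::Request class: it wraps a MetaInspector::URL, exposes the fetched body and content type, and records network failures instead of raising them. A usage sketch limited to the calls shown above (the timeout option is the one from the timeout spec):

    url     = MetaInspector::URL.new('http://pagerankalert.com')
    request = MetaInspector::Request.new(url, timeout: 20)

    request.content_type  # => "text/html"
    html = request.read   # page body; nil when a Timeout::Error or SocketError was caught
    request.exceptions    # => [] on success; holds the caught exception objects on failure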
spec/url_spec.rb ADDED
@@ -0,0 +1,74 @@
+# -*- encoding: utf-8 -*-
+
+require File.join(File.dirname(__FILE__), "/spec_helper")
+
+describe MetaInspector::URL do
+  it "should normalize URLs" do
+    MetaInspector::URL.new('http://example.com').url.should == 'http://example.com/'
+  end
+
+  it 'should accept an URL with a scheme' do
+    MetaInspector::URL.new('http://example.com/').url.should == 'http://example.com/'
+  end
+
+  it "should use http:// as a default scheme" do
+    MetaInspector::URL.new('example.com').url.should == 'http://example.com/'
+  end
+
+  it "should accept an URL with international characters" do
+    MetaInspector::URL.new('http://international.com/olé').url.should == 'http://international.com/ol%C3%A9'
+  end
+
+  it "should return the scheme" do
+    MetaInspector::URL.new('http://example.com').scheme.should == 'http'
+    MetaInspector::URL.new('https://example.com').scheme.should == 'https'
+    MetaInspector::URL.new('example.com').scheme.should == 'http'
+  end
+
+  it "should return the host" do
+    MetaInspector::URL.new('http://example.com').host.should == 'example.com'
+    MetaInspector::URL.new('https://example.com').host.should == 'example.com'
+    MetaInspector::URL.new('example.com').host.should == 'example.com'
+  end
+
+  it "should return the root url" do
+    MetaInspector::URL.new('http://example.com').root_url.should == 'http://example.com/'
+    MetaInspector::URL.new('https://example.com').root_url.should == 'https://example.com/'
+    MetaInspector::URL.new('example.com').root_url.should == 'http://example.com/'
+    MetaInspector::URL.new('http://example.com/faqs').root_url.should == 'http://example.com/'
+  end
+
+  describe "url=" do
+    it "should update the url" do
+      url = MetaInspector::URL.new('http://first.com/')
+
+      url.url = 'http://second.com/'
+      url.url.should == 'http://second.com/'
+    end
+
+    it "should add the missing scheme and normalize" do
+      url = MetaInspector::URL.new('http://first.com/')
+
+      url.url = 'second.com'
+      url.url.should == 'http://second.com/'
+    end
+  end
+
+  describe "exception handling" do
+    it "should handle URI::InvalidURIError" do
+      expect {
+        @malformed = MetaInspector::URL.new('javascript://')
+      }.to_not raise_error
+
+      @malformed.exceptions.first.class.should == URI::InvalidURIError
+    end
+
+    it "should handle URI::InvalidComponentError" do
+      expect {
+        @malformed = MetaInspector::URL.new('mailto:email(at)example.com')
+      }.to_not raise_error
+
+      @malformed.exceptions.first.class.should == URI::InvalidComponentError
+    end
+  end
+end
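
These specs also fix the construction contract: MetaInspector::URL normalizes input on the way in and never raises on malformed input, logging the failure instead. Condensed from the examples above:

    url = MetaInspector::URL.new('example.com')
    url.url       # => "http://example.com/"  (default scheme added, then normalized)
    url.root_url  # => "http://example.com/"

    bad = MetaInspector::URL.new('javascript://')  # does not raise
    bad.exceptions.first.class                     # => URI::InvalidURIError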
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: metainspector
 version: !ruby/object:Gem::Version
-  version: 1.16.1
+  version: 1.17.0
 platform: ruby
 authors:
 - Jaime Iniesta
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-10-01 00:00:00.000000000 Z
+date: 2013-10-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -138,12 +138,20 @@ files:
 - README.md
 - Rakefile
 - lib/meta_inspector.rb
-- lib/meta_inspector/scraper.rb
+- lib/meta_inspector/deprecations.rb
+- lib/meta_inspector/document.rb
+- lib/meta_inspector/exception_log.rb
+- lib/meta_inspector/exceptionable.rb
+- lib/meta_inspector/parser.rb
+- lib/meta_inspector/request.rb
+- lib/meta_inspector/url.rb
 - lib/meta_inspector/version.rb
 - lib/metainspector.rb
 - meta_inspector.gemspec
 - samples/basic_scraping.rb
 - samples/spider.rb
+- spec/document_spec.rb
+- spec/exception_log_spec.rb
 - spec/fixtures/alazan.com.response
 - spec/fixtures/alazan_websolution.response
 - spec/fixtures/charset_000.response
@@ -171,9 +179,12 @@ files:
 - spec/fixtures/unsafe_https.facebook.com.response
 - spec/fixtures/wordpress_site.response
 - spec/fixtures/youtube.response
-- spec/metainspector_spec.rb
+- spec/meta_inspector_spec.rb
+- spec/parser_spec.rb
 - spec/redirections_spec.rb
+- spec/request_spec.rb
 - spec/spec_helper.rb
+- spec/url_spec.rb
 homepage: http://jaimeiniesta.github.io/metainspector/
 licenses: []
 metadata: {}
@@ -183,17 +194,17 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ! '>='
     - !ruby/object:Gem::Version
      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.1.3
+rubygems_version: 2.0.5
 signing_key:
 specification_version: 4
 summary: MetaInspector is a ruby gem for web scraping purposes, that returns a hash
lib/meta_inspector/scraper.rb DELETED
@@ -1,283 +0,0 @@
-# -*- encoding: utf-8 -*-
-
-require 'open-uri'
-require 'open_uri_redirections'
-require 'addressable/uri'
-require 'nokogiri'
-require 'hashie/rash'
-require 'timeout'
-
-# MetaInspector provides an easy way to scrape web pages and get its elements
-module MetaInspector
-  class Scraper
-    attr_reader :url, :scheme, :host, :root_url, :errors, :content_type, :timeout, :html_content_only
-    attr_reader :allow_redirections, :verbose
-
-    # Initializes a new instance of MetaInspector, setting the URL to the one given
-    # Options:
-    # => timeout: defaults to 20 seconds
-    # => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
-    # => allow_redirections: when :safe, allows HTTP => HTTPS redirections. When :all, it also allows HTTPS => HTTP
-    # => document: the html of the url as a string
-    # => verbose: if the errors should be logged to the screen
-    def initialize(url, options = {})
-      options = defaults.merge(options)
-
-      @url                = with_default_scheme(normalize_url(url))
-      @scheme             = URI.parse(@url).scheme
-      @host               = URI.parse(@url).host
-      @root_url           = "#{@scheme}://#{@host}/"
-      @timeout            = options[:timeout]
-      @data               = Hashie::Rash.new
-      @errors             = []
-      @html_content_only  = options[:html_content_only]
-      @allow_redirections = options[:allow_redirections]
-      @verbose            = options[:verbose]
-      @document           = options[:document]
-    end
-
-    # Returns the parsed document title, from the content of the <title> tag.
-    # This is not the same as the meta_title tag
-    def title
-      @title ||= parsed_document.css('title').inner_text rescue nil
-    end
-
-    # A description getter that first checks for a meta description and if not present will
-    # guess by looking at the first paragraph with more than 120 characters
-    def description
-      meta_description.nil? ? secondary_description : meta_description
-    end
-
-    # Links found on the page, as absolute URLs
-    def links
-      @links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact.uniq
-    end
-
-    # Internal links found on the page, as absolute URLs
-    def internal_links
-      @internal_links ||= links.select {|link| host_from_url(link) == host }
-    end
-
-    # External links found on the page, as absolute URLs
-    def external_links
-      @external_links ||= links.select {|link| host_from_url(link) != host }
-    end
-
-    # Images found on the page, as absolute URLs
-    def images
-      @images ||= parsed_images.map{ |i| absolutify_url(i) }
-    end
-
-    # Returns the parsed image from Facebook's open graph property tags
-    # Most all major websites now define this property and is usually very relevant
-    # See doc at http://developers.facebook.com/docs/opengraph/
-    def image
-      meta_og_image || meta_twitter_image
-    end
-
-    # Returns the parsed document meta rss link
-    def feed
-      @feed ||= (parsed_feed('rss') || parsed_feed('atom'))
-    end
-
-    # Returns the charset from the meta tags, looking for it in the following order:
-    # <meta charset='utf-8' />
-    # <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
-    def charset
-      @charset ||= (charset_from_meta_charset || charset_from_content_type)
-    end
-
-    # Returns all parsed data as a nested Hash
-    def to_hash
-      scrape_meta_data
-
-      {
-        'url'            => url,
-        'title'          => title,
-        'links'          => links,
-        'internal_links' => internal_links,
-        'external_links' => external_links,
-        'images'         => images,
-        'charset'        => charset,
-        'feed'           => feed,
-        'content_type'   => content_type
-      }.merge @data.to_hash
-    end
-
-    # Returns the whole parsed document
-    def parsed_document
-      @parsed_document ||= Nokogiri::HTML(document)
-    rescue Exception => e
-      add_fatal_error "Parsing exception: #{e.message}"
-    end
-
-    # Returns the original, unparsed document
-    def document
-      @document ||= if html_content_only && content_type != "text/html"
-                      raise "The url provided contains #{content_type} content instead of text/html content" and nil
-                    else
-                      request.read
-                    end
-    rescue Exception => e
-      add_fatal_error "Scraping exception: #{e.message}"
-    end
-
-    # Returns the content_type of the fetched document
-    def content_type
-      @content_type ||= request.content_type
-    end
-
-    # Returns true if there are no errors
-    def ok?
-      errors.empty?
-    end
-
-    private
-
-    def defaults
-      {
-        :timeout           => 20,
-        :html_content_only => false,
-        :verbose           => false
-      }
-    end
-
-    # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
-    # meta name: keywords, description, robots, generator
-    # meta http-equiv: content-language, Content-Type
-    #
-    # It will first try with meta name="..." and if nothing found,
-    # with meta http-equiv="...", substituting "_" by "-"
-    # TODO: define respond_to? to return true on the meta_name methods
-    def method_missing(method_name)
-      if method_name.to_s =~ /^meta_(.*)/
-        key = $1
-
-        # special treatment for opengraph (og:) and twitter card (twitter:) tags
-        key.gsub!("_",":") if key =~ /^og_(.*)/ || key =~ /^twitter_(.*)/
-
-        scrape_meta_data
-
-        @data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
-      else
-        super
-      end
-    end
-
-    # Makes the request to the server
-    def request
-      Timeout::timeout(timeout) { @request ||= open(url, {:allow_redirections => allow_redirections}) }
-
-    rescue TimeoutError
-      add_fatal_error 'Timeout!!!'
-    rescue SocketError
-      add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
-    rescue Exception => e
-      add_fatal_error "Scraping exception: #{e.message}"
-    end
-
-    # Scrapes all meta tags found
-    def scrape_meta_data
-      unless @data.meta
-        @data.meta!.name!
-        @data.meta!.property!
-        parsed_document.xpath("//meta").each do |element|
-          get_meta_name_or_property(element)
-        end
-      end
-    end
-
-    # Store meta tag value, looking at meta name or meta property
-    def get_meta_name_or_property(element)
-      name_or_property = element.attributes["name"] ? "name" : (element.attributes["property"] ? "property" : nil)
-      content_or_value = element.attributes["content"] ? "content" : (element.attributes["value"] ? "value" : nil)
-
-      if !name_or_property.nil? && !content_or_value.nil?
-        @data.meta.name[element.attributes[name_or_property].value.downcase] = element.attributes[content_or_value].value
-      end
-    end
-
-    def parsed_feed(format)
-      feed = parsed_document.search("//link[@type='application/#{format}+xml']").first
-      feed ? absolutify_url(feed.attributes['href'].value) : nil
-    end
-
-    def parsed_links
-      @parsed_links ||= cleanup_nokogiri_values(parsed_document.search("//a/@href"))
-    end
-
-    def parsed_images
-      @parsed_images ||= cleanup_nokogiri_values(parsed_document.search('//img/@src'))
-    end
-
-    # Takes a nokogiri search result, strips the values, rejects the empty ones, and removes duplicates
-    def cleanup_nokogiri_values(results)
-      results.map { |a| a.value.strip }.reject { |s| s.empty? }.uniq
-    end
-
-    # Stores the error for later inspection
-    def add_fatal_error(error)
-      warn error if verbose
-      @errors << error
-    end
-
-    # Normalize url to deal with characters that should be encodes, add trailing slash, convert to downcase...
-    def normalize_url(url)
-      Addressable::URI.parse(url).normalize.to_s
-    end
-
-    # Adds 'http' as default scheme, if there if none
-    def with_default_scheme(url)
-      URI.parse(url).scheme.nil? ? 'http://' + url : url
-    end
-
-    # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
-    # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
-    def absolutify_url(uri)
-      if uri =~ /^\w*\:/i
-        normalize_url(uri)
-      else
-        Addressable::URI.join(base_url, uri).normalize.to_s
-      end
-    rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
-      add_fatal_error "Link parsing exception: #{e.message}" and nil
-    end
-
-    # Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
-    # or the url of the document if no <base> tag was found.
-    def base_url
-      base_href || @url
-    end
-
-    # Returns the value of the href attribute on the <base /> tag, if it exists
-    def base_href
-      parsed_document.search('base').first.attributes['href'].value rescue nil
-    end
-
-    # Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
-    def unrelativize_url(url)
-      url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
-    end
-
-    # Extracts the host from a given URL
-    def host_from_url(url)
-      URI.parse(url).host
-    rescue URI::InvalidURIError, URI::InvalidComponentError, Addressable::URI::InvalidURIError => e
-      add_fatal_error "Link parsing exception: #{e.message}" and nil
-    end
-
-    # Look for the first <p> block with 120 characters or more
-    def secondary_description
-      first_long_paragraph = parsed_document.search('//p[string-length() >= 120]').first
-      first_long_paragraph ? first_long_paragraph.text : ''
-    end
-
-    def charset_from_meta_charset
-      parsed_document.css("meta[charset]")[0].attributes['charset'].value rescue nil
-    end
-
-    def charset_from_content_type
-      parsed_document.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(";")[1].split("=")[1] rescue nil
-    end
-  end
-end
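
The monolithic Scraper deleted above is what the 1.17.0 file list replaces with smaller objects (deprecations.rb, document.rb, exception_log.rb, exceptionable.rb, parser.rb, request.rb, url.rb). Only Request and URL are confirmed by the specs in this diff; the mapping below is inferred from the file names and the old method names, not a documented API:

    # Old Scraper responsibility          -> new home (inferred)
    # normalize_url / with_default_scheme -> MetaInspector::URL (url_spec.rb)
    # request / document / content_type   -> MetaInspector::Request (request_spec.rb)
    # @errors / add_fatal_error           -> MetaInspector::ExceptionLog / Exceptionable
    # title, links, images, meta_* tags   -> MetaInspector::Parser / Document
    url = MetaInspector::URL.new('example.com')
    MetaInspector::Request.new(url).read  # fetch; failures land in request.exceptions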