metainspector 1.16.1 → 1.17.0

This diff shows the changes between publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
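The headline change in 1.17.0, visible in the specs below, is error handling: scraping failures are now collected as exception objects exposed through #exceptions, replacing the old #errors array of strings, and redirections now update the document's url. A minimal sketch of the new usage, inferred from the specs below (the URL is illustrative):

    require 'metainspector'

    # :safe follows HTTP -> HTTPS redirections only; :all follows both directions
    m = MetaInspector.new("http://facebook.com", :allow_redirections => :safe)

    if m.ok?
      m.title  # parsed <title>
      m.url    # post-redirection URL, e.g. "https://www.facebook.com/"
    else
      # 1.16.1: m.errors.first      => "Scraping exception: ..." (a string)
      # 1.17.0: m.exceptions.first  => the exception object itself
      m.exceptions.each { |e| puts e.message }
    end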
spec/redirections_spec.rb CHANGED
@@ -9,7 +9,7 @@ describe MetaInspector do
  m = MetaInspector.new("http://facebook.com")
  m.title.should be_nil
  m.should_not be_ok
- m.errors.first.should == "Scraping exception: redirection forbidden: http://facebook.com/ -> https://www.facebook.com/"
+ m.exceptions.first.message.should == "redirection forbidden: http://facebook.com/ -> https://www.facebook.com/"
  end

  it "allows safe redirections when :allow_redirections => :safe" do
@@ -30,14 +30,14 @@ describe MetaInspector do
  m = MetaInspector.new("https://unsafe-facebook.com")
  m.title.should be_nil
  m.should_not be_ok
- m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
+ m.exceptions.first.message.should == "redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
  end

  it "disallows unsafe redirections when :allow_redirections => :safe" do
  m = MetaInspector.new("https://unsafe-facebook.com", :allow_redirections => :safe)
  m.title.should be_nil
  m.should_not be_ok
- m.errors.first.should == "Scraping exception: redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
+ m.exceptions.first.message.should == "redirection forbidden: https://unsafe-facebook.com/ -> http://unsafe-facebook.com/"
  end

  it "allows unsafe redirections when :allow_redirections => :all" do
@@ -46,5 +46,22 @@ describe MetaInspector do
  m.should be_ok
  end
  end
+
+ describe "Redirections should update the base_uri" do
+ it "updates the base_uri on safe redirections" do
+ m = MetaInspector.new("http://facebook.com", :allow_redirections => :safe)
+ # Check for the title to make sure the request happens
+ m.title.should == "Hello From Facebook"
+ m.url.should == "https://www.facebook.com/"
+ end
+
+ it "updates the base_uri on all redirections" do
+ m = MetaInspector.new("http://facebook.com", :allow_redirections => :all)
+ # Check for the title to make sure the request happens
+ m.title.should == "Hello From Facebook"
+
+ m.url.should == "https://www.facebook.com/"
+ end
+ end
  end
  end
spec/request_spec.rb ADDED
@@ -0,0 +1,64 @@
+ # -*- encoding: utf-8 -*-
+
+ require File.join(File.dirname(__FILE__), "/spec_helper")
+
+ describe MetaInspector::Request do
+
+ describe "read" do
+ it "should return the content of the page" do
+ page_request = MetaInspector::Request.new(url('http://pagerankalert.com'))
+
+ page_request.read[0..14].should == "<!DOCTYPE html>"
+ end
+ end
+
+ describe "content_type" do
+ it "should return the correct content type of the url for html pages" do
+ page_request = MetaInspector::Request.new(url('http://pagerankalert.com'))
+
+ page_request.content_type.should == "text/html"
+ end
+
+ it "should return the correct content type of the url for non html pages" do
+ image_request = MetaInspector::Request.new(url('http://pagerankalert.com/image.png'))
+
+ image_request.content_type.should == "image/png"
+ end
+ end
+
+ describe 'exception handling' do
+ before(:each) do
+ FakeWeb.allow_net_connect = true
+ end
+
+ after(:each) do
+ FakeWeb.allow_net_connect = false
+ end
+
+ it "should handle timeouts" do
+ impatient = MetaInspector::Request.new(url('http://example.com'), timeout: 0.0000000000001)
+
+ expect {
+ impatient.read.should be_nil
+ }.to change { impatient.exceptions.size }
+
+ impatient.exceptions.first.class.should == Timeout::Error
+ end
+
+ it "should handle socket errors" do
+ nowhere = MetaInspector::Request.new(url('http://caca232dsdsaer3sdsd-asd343.org'))
+
+ expect {
+ nowhere.read.should be_nil
+ }.to change { nowhere.exceptions.size }
+
+ nowhere.exceptions.first.class.should == SocketError
+ end
+ end
+
+ private
+
+ def url(initial_url)
+ MetaInspector::URL.new(initial_url)
+ end
+ end
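The request spec above also documents the refactored internals listed in the gemspec below (url.rb, request.rb, document.rb and friends): a MetaInspector::Request is built from a MetaInspector::URL and logs failures instead of raising. A minimal sketch of that internal API, assuming only the calls exercised in these specs:

    url     = MetaInspector::URL.new('example.com')   # normalized to "http://example.com/"
    request = MetaInspector::Request.new(url)

    request.read           # page body, or nil when the request failed
    request.content_type   # e.g. "text/html"
    request.exceptions     # e.g. [SocketError] for an unresolvable host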
spec/url_spec.rb ADDED
@@ -0,0 +1,74 @@
+ # -*- encoding: utf-8 -*-
+
+ require File.join(File.dirname(__FILE__), "/spec_helper")
+
+ describe MetaInspector::URL do
+ it "should normalize URLs" do
+ MetaInspector::URL.new('http://example.com').url.should == 'http://example.com/'
+ end
+
+ it 'should accept an URL with a scheme' do
+ MetaInspector::URL.new('http://example.com/').url.should == 'http://example.com/'
+ end
+
+ it "should use http:// as a default scheme" do
+ MetaInspector::URL.new('example.com').url.should == 'http://example.com/'
+ end
+
+ it "should accept an URL with international characters" do
+ MetaInspector::URL.new('http://international.com/olé').url.should == 'http://international.com/ol%C3%A9'
+ end
+
+ it "should return the scheme" do
+ MetaInspector::URL.new('http://example.com').scheme.should == 'http'
+ MetaInspector::URL.new('https://example.com').scheme.should == 'https'
+ MetaInspector::URL.new('example.com').scheme.should == 'http'
+ end
+
+ it "should return the host" do
+ MetaInspector::URL.new('http://example.com').host.should == 'example.com'
+ MetaInspector::URL.new('https://example.com').host.should == 'example.com'
+ MetaInspector::URL.new('example.com').host.should == 'example.com'
+ end
+
+ it "should return the root url" do
+ MetaInspector::URL.new('http://example.com').root_url.should == 'http://example.com/'
+ MetaInspector::URL.new('https://example.com').root_url.should == 'https://example.com/'
+ MetaInspector::URL.new('example.com').root_url.should == 'http://example.com/'
+ MetaInspector::URL.new('http://example.com/faqs').root_url.should == 'http://example.com/'
+ end
+
+ describe "url=" do
+ it "should update the url" do
+ url = MetaInspector::URL.new('http://first.com/')
+
+ url.url = 'http://second.com/'
+ url.url.should == 'http://second.com/'
+ end
+
+ it "should add the missing scheme and normalize" do
+ url = MetaInspector::URL.new('http://first.com/')
+
+ url.url = 'second.com'
+ url.url.should == 'http://second.com/'
+ end
+ end
+
+ describe "exception handling" do
+ it "should handle URI::InvalidURIError" do
+ expect {
+ @malformed = MetaInspector::URL.new('javascript://')
+ }.to_not raise_error
+
+ @malformed.exceptions.first.class.should == URI::InvalidURIError
+ end
+
+ it "should handle URI::InvalidComponentError" do
+ expect {
+ @malformed = MetaInspector::URL.new('mailto:email(at)example.com')
+ }.to_not raise_error
+
+ @malformed.exceptions.first.class.should == URI::InvalidComponentError
+ end
+ end
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: metainspector
  version: !ruby/object:Gem::Version
- version: 1.16.1
+ version: 1.17.0
  platform: ruby
  authors:
  - Jaime Iniesta
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-10-01 00:00:00.000000000 Z
+ date: 2013-10-22 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri
@@ -138,12 +138,20 @@ files:
  - README.md
  - Rakefile
  - lib/meta_inspector.rb
- - lib/meta_inspector/scraper.rb
+ - lib/meta_inspector/deprecations.rb
+ - lib/meta_inspector/document.rb
+ - lib/meta_inspector/exception_log.rb
+ - lib/meta_inspector/exceptionable.rb
+ - lib/meta_inspector/parser.rb
+ - lib/meta_inspector/request.rb
+ - lib/meta_inspector/url.rb
  - lib/meta_inspector/version.rb
  - lib/metainspector.rb
  - meta_inspector.gemspec
  - samples/basic_scraping.rb
  - samples/spider.rb
+ - spec/document_spec.rb
+ - spec/exception_log_spec.rb
  - spec/fixtures/alazan.com.response
  - spec/fixtures/alazan_websolution.response
  - spec/fixtures/charset_000.response
@@ -171,9 +179,12 @@ files:
  - spec/fixtures/unsafe_https.facebook.com.response
  - spec/fixtures/wordpress_site.response
  - spec/fixtures/youtube.response
- - spec/metainspector_spec.rb
+ - spec/meta_inspector_spec.rb
+ - spec/parser_spec.rb
  - spec/redirections_spec.rb
+ - spec/request_spec.rb
  - spec/spec_helper.rb
+ - spec/url_spec.rb
  homepage: http://jaimeiniesta.github.io/metainspector/
  licenses: []
  metadata: {}
@@ -183,17 +194,17 @@ require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.1.3
+ rubygems_version: 2.0.5
  signing_key:
  specification_version: 4
  summary: MetaInspector is a ruby gem for web scraping purposes, that returns a hash
lib/meta_inspector/scraper.rb DELETED
@@ -1,283 +0,0 @@
- # -*- encoding: utf-8 -*-
-
- require 'open-uri'
- require 'open_uri_redirections'
- require 'addressable/uri'
- require 'nokogiri'
- require 'hashie/rash'
- require 'timeout'
-
- # MetaInspector provides an easy way to scrape web pages and get its elements
- module MetaInspector
- class Scraper
- attr_reader :url, :scheme, :host, :root_url, :errors, :content_type, :timeout, :html_content_only
- attr_reader :allow_redirections, :verbose
-
- # Initializes a new instance of MetaInspector, setting the URL to the one given
- # Options:
- # => timeout: defaults to 20 seconds
- # => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
- # => allow_redirections: when :safe, allows HTTP => HTTPS redirections. When :all, it also allows HTTPS => HTTP
- # => document: the html of the url as a string
- # => verbose: if the errors should be logged to the screen
- def initialize(url, options = {})
- options = defaults.merge(options)
-
- @url = with_default_scheme(normalize_url(url))
- @scheme = URI.parse(@url).scheme
- @host = URI.parse(@url).host
- @root_url = "#{@scheme}://#{@host}/"
- @timeout = options[:timeout]
- @data = Hashie::Rash.new
- @errors = []
- @html_content_only = options[:html_content_only]
- @allow_redirections = options[:allow_redirections]
- @verbose = options[:verbose]
- @document = options[:document]
- end
-
- # Returns the parsed document title, from the content of the <title> tag.
- # This is not the same as the meta_title tag
- def title
- @title ||= parsed_document.css('title').inner_text rescue nil
- end
-
- # A description getter that first checks for a meta description and if not present will
- # guess by looking at the first paragraph with more than 120 characters
- def description
- meta_description.nil? ? secondary_description : meta_description
- end
-
- # Links found on the page, as absolute URLs
- def links
- @links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.compact.uniq
- end
-
- # Internal links found on the page, as absolute URLs
- def internal_links
- @internal_links ||= links.select {|link| host_from_url(link) == host }
- end
-
- # External links found on the page, as absolute URLs
- def external_links
- @external_links ||= links.select {|link| host_from_url(link) != host }
- end
-
- # Images found on the page, as absolute URLs
- def images
- @images ||= parsed_images.map{ |i| absolutify_url(i) }
- end
-
- # Returns the parsed image from Facebook's open graph property tags
- # Most all major websites now define this property and is usually very relevant
- # See doc at http://developers.facebook.com/docs/opengraph/
- def image
- meta_og_image || meta_twitter_image
- end
-
- # Returns the parsed document meta rss link
- def feed
- @feed ||= (parsed_feed('rss') || parsed_feed('atom'))
- end
-
- # Returns the charset from the meta tags, looking for it in the following order:
- # <meta charset='utf-8' />
- # <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
- def charset
- @charset ||= (charset_from_meta_charset || charset_from_content_type)
- end
-
- # Returns all parsed data as a nested Hash
- def to_hash
- scrape_meta_data
-
- {
- 'url' => url,
- 'title' => title,
- 'links' => links,
- 'internal_links' => internal_links,
- 'external_links' => external_links,
- 'images' => images,
- 'charset' => charset,
- 'feed' => feed,
- 'content_type' => content_type
- }.merge @data.to_hash
- end
-
- # Returns the whole parsed document
- def parsed_document
- @parsed_document ||= Nokogiri::HTML(document)
- rescue Exception => e
- add_fatal_error "Parsing exception: #{e.message}"
- end
-
- # Returns the original, unparsed document
- def document
- @document ||= if html_content_only && content_type != "text/html"
- raise "The url provided contains #{content_type} content instead of text/html content" and nil
- else
- request.read
- end
- rescue Exception => e
- add_fatal_error "Scraping exception: #{e.message}"
- end
-
- # Returns the content_type of the fetched document
- def content_type
- @content_type ||= request.content_type
- end
-
- # Returns true if there are no errors
- def ok?
- errors.empty?
- end
-
- private
-
- def defaults
- {
- :timeout => 20,
- :html_content_only => false,
- :verbose => false
- }
- end
-
- # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
- # meta name: keywords, description, robots, generator
- # meta http-equiv: content-language, Content-Type
- #
- # It will first try with meta name="..." and if nothing found,
- # with meta http-equiv="...", substituting "_" by "-"
- # TODO: define respond_to? to return true on the meta_name methods
- def method_missing(method_name)
- if method_name.to_s =~ /^meta_(.*)/
- key = $1
-
- #special treatment for opengraph (og:) and twitter card (twitter:) tags
- key.gsub!("_",":") if key =~ /^og_(.*)/ || key =~ /^twitter_(.*)/
-
- scrape_meta_data
-
- @data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
- else
- super
- end
- end
-
- # Makes the request to the server
- def request
- Timeout::timeout(timeout) { @request ||= open(url, {:allow_redirections => allow_redirections}) }
-
- rescue TimeoutError
- add_fatal_error 'Timeout!!!'
- rescue SocketError
- add_fatal_error 'Socket error: The url provided does not exist or is temporarily unavailable'
- rescue Exception => e
- add_fatal_error "Scraping exception: #{e.message}"
- end
-
- # Scrapes all meta tags found
- def scrape_meta_data
- unless @data.meta
- @data.meta!.name!
- @data.meta!.property!
- parsed_document.xpath("//meta").each do |element|
- get_meta_name_or_property(element)
- end
- end
- end
-
- # Store meta tag value, looking at meta name or meta property
- def get_meta_name_or_property(element)
- name_or_property = element.attributes["name"] ? "name" : (element.attributes["property"] ? "property" : nil)
- content_or_value = element.attributes["content"] ? "content" : (element.attributes["value"] ? "value" : nil)
-
- if !name_or_property.nil? && !content_or_value.nil?
- @data.meta.name[element.attributes[name_or_property].value.downcase] = element.attributes[content_or_value].value
- end
- end
-
- def parsed_feed(format)
- feed = parsed_document.search("//link[@type='application/#{format}+xml']").first
- feed ? absolutify_url(feed.attributes['href'].value) : nil
- end
-
- def parsed_links
- @parsed_links ||= cleanup_nokogiri_values(parsed_document.search("//a/@href"))
- end
-
- def parsed_images
- @parsed_images ||= cleanup_nokogiri_values(parsed_document.search('//img/@src'))
- end
-
- # Takes a nokogiri search result, strips the values, rejects the empty ones, and removes duplicates
- def cleanup_nokogiri_values(results)
- results.map { |a| a.value.strip }.reject { |s| s.empty? }.uniq
- end
-
- # Stores the error for later inspection
- def add_fatal_error(error)
- warn error if verbose
- @errors << error
- end
-
- # Normalize url to deal with characters that should be encodes, add trailing slash, convert to downcase...
- def normalize_url(url)
- Addressable::URI.parse(url).normalize.to_s
- end
-
- # Adds 'http' as default scheme, if there if none
- def with_default_scheme(url)
- URI.parse(url).scheme.nil? ? 'http://' + url : url
- end
-
- # Convert a relative url like "/users" to an absolute one like "http://example.com/users"
- # Respecting already absolute URLs like the ones starting with http:, ftp:, telnet:, mailto:, javascript: ...
- def absolutify_url(uri)
- if uri =~ /^\w*\:/i
- normalize_url(uri)
- else
- Addressable::URI.join(base_url, uri).normalize.to_s
- end
- rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
- add_fatal_error "Link parsing exception: #{e.message}" and nil
- end
-
- # Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
- # or the url of the document if no <base> tag was found.
- def base_url
- base_href || @url
- end
-
- # Returns the value of the href attribute on the <base /> tag, if it exists
- def base_href
- parsed_document.search('base').first.attributes['href'].value rescue nil
- end
-
- # Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
- def unrelativize_url(url)
- url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
- end
-
- # Extracts the host from a given URL
- def host_from_url(url)
- URI.parse(url).host
- rescue URI::InvalidURIError, URI::InvalidComponentError, Addressable::URI::InvalidURIError => e
- add_fatal_error "Link parsing exception: #{e.message}" and nil
- end
-
- # Look for the first <p> block with 120 characters or more
- def secondary_description
- first_long_paragraph = parsed_document.search('//p[string-length() >= 120]').first
- first_long_paragraph ? first_long_paragraph.text : ''
- end
-
- def charset_from_meta_charset
- parsed_document.css("meta[charset]")[0].attributes['charset'].value rescue nil
- end
-
- def charset_from_content_type
- parsed_document.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(";")[1].split("=")[1] rescue nil
- end
- end
- end