metainspector 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: fcb1f8adaeb835639f198ee9ede5f54da02211fd
4
- data.tar.gz: 40bb733a1e3f2b48011c535951a2a2ac2bb097dc
3
+ metadata.gz: 9c230bbbcf325166d2e5906ed7f13af8e0f8ef06
4
+ data.tar.gz: 245e219fc420a20e8fd8eef722502a6cc23d8087
5
5
  SHA512:
6
- metadata.gz: 5f502ee4f24fb04b8d0afb8e369a5da2ac7c7c4a4b4c8a743cfbfd24f0f15125d139c5f35bd65c2fe9bce59d4015305c2fdf05f4fe6e696e691e90bb60e445b2
7
- data.tar.gz: 2f9252d0283f815b15bc7e4c43b33e588975b63a935354e3e0c34639a0bad1c96d52b63308f9902120829b3747c796fc43f36d95d5ee78237c9dbc4fb64baa80
6
+ metadata.gz: 791526a75ce5e23651a7cbded8ee4e9ab12fd75281d7089a2f2191ee44293c9f1b8d8587cae3284e2ed1c103d9020cfa2038fd1be4ddc90d2e78d1235fcc0374
7
+ data.tar.gz: f3c535c4a4b7618168e30cf2427a22365404dc00f698dde32669364a86770bc8445b20989974298afd124822fddea272afd000035663d7e0eb6cd5acb0676006
data/.travis.yml CHANGED
@@ -2,4 +2,5 @@ rvm:
2
2
  - 1.9.2
3
3
  - 1.9.3
4
4
  - 2.0.0
5
- - 2.1.0
5
+ - 2.1.0
6
+ - 2.1.1
data/README.md CHANGED
@@ -191,6 +191,17 @@ However, you can tell MetaInspector to allow these redirections with the option
191
191
  # And this will allow HTTP => HTTPS ("safe") and HTTPS => HTTP ("unsafe") redirections
192
192
  page = MetaInspector.new('facebook.com', :allow_redirections => :all)
193
193
 
194
+ ### Headers
195
+
196
+ By default, the following headers are set:
197
+
198
+ {'User-Agent' => "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"}
199
+
200
+ If you want to set custom headers then use the `headers` option:
201
+
202
+ # Set the User-Agent header
203
+ page = MetaInspector.new('example.com', :headers => {'User-Agent' => 'My custom User-Agent'})
204
+
194
205
  ### HTML Content Only
195
206
 
196
207
  MetaInspector will try to parse all URLs by default. If you want to raise an exception when trying to parse a non-html URL (one that has a content-type different than text/html), you can state it like this:
@@ -215,7 +226,7 @@ However, if you prefer you can also set the `warn_level: :warn` option, so that
215
226
 
216
227
  You can also set the `warn_level: :store` option so that exceptions found will be silenced, and left for you to inspect on `page.exceptions`. You can also ask for `page.ok?`, which will return `true` if no exceptions are stored.
217
228
 
218
- You should avoid using the `:store` option, or use it wisely, as silencing errors can be problematic, it's always better to face the errors and treat them accordingly.
229
+ You should avoid using the `:store` option, or use it wisely, as silencing errors can be problematic, it's always better to face the errors and treat them accordingly.
219
230
 
220
231
  ## Examples
221
232
 
@@ -7,7 +7,6 @@ require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/reque
7
7
  require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/url'))
8
8
  require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parser'))
9
9
  require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/document'))
10
- require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/deprecations'))
11
10
 
12
11
  module MetaInspector
13
12
  extend self
@@ -3,7 +3,7 @@
3
3
  module MetaInspector
4
4
  # A MetaInspector::Document knows about its URL and its contents
5
5
  class Document
6
- attr_reader :timeout, :html_content_only, :allow_redirections, :warn_level
6
+ attr_reader :timeout, :html_content_only, :allow_redirections, :warn_level, :headers
7
7
 
8
8
  include MetaInspector::Exceptionable
9
9
 
@@ -14,25 +14,22 @@ module MetaInspector
14
14
  # => allow_redirections: when :safe, allows HTTP => HTTPS redirections. When :all, it also allows HTTPS => HTTP
15
15
  # => document: the html of the url as a string
16
16
  # => warn_level: what to do when encountering exceptions. Can be :warn, :raise or nil
17
+ # => headers: object containing custom headers for the request
17
18
  def initialize(initial_url, options = {})
18
19
  options = defaults.merge(options)
19
20
  @timeout = options[:timeout]
20
21
  @html_content_only = options[:html_content_only]
21
22
  @allow_redirections = options[:allow_redirections]
22
23
  @document = options[:document]
23
-
24
- if options[:verbose] == true
25
- warn "The verbose option is deprecated since 1.17, please use warn_level: :warn instead"
26
- options[:warn_level] = :warn
27
- end
28
-
29
- @warn_level = options[:warn_level]
30
- @exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level)
31
- @url = MetaInspector::URL.new(initial_url, exception_log: @exception_log)
32
- @request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
33
- timeout: @timeout,
34
- exception_log: @exception_log) unless @document
35
- @parser = MetaInspector::Parser.new(self, exception_log: @exception_log)
24
+ @headers = options[:headers]
25
+ @warn_level = options[:warn_level]
26
+ @exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level)
27
+ @url = MetaInspector::URL.new(initial_url, exception_log: @exception_log)
28
+ @request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
29
+ timeout: @timeout,
30
+ exception_log: @exception_log,
31
+ headers: @headers) unless @document
32
+ @parser = MetaInspector::Parser.new(self, exception_log: @exception_log)
36
33
  end
37
34
 
38
35
  extend Forwardable
@@ -65,7 +62,11 @@ module MetaInspector
65
62
  private
66
63
 
67
64
  def defaults
68
- { :timeout => 20, :html_content_only => false, :warn_level => :raise }
65
+ { :timeout => 20,
66
+ :html_content_only => false,
67
+ :warn_level => :raise,
68
+ :headers => {'User-Agent' => "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"}
69
+ }
69
70
  end
70
71
 
71
72
  def document
@@ -17,6 +17,7 @@ module MetaInspector
17
17
  @allow_redirections = options[:allow_redirections]
18
18
  @timeout = options[:timeout]
19
19
  @exception_log = options[:exception_log]
20
+ @headers = options[:headers]
20
21
 
21
22
  response # as soon as it is set up, we make the request so we can fail early
22
23
  end
@@ -43,7 +44,11 @@ module MetaInspector
43
44
  end
44
45
 
45
46
  def fetch
46
- request = open(url, {:allow_redirections => @allow_redirections})
47
+ options = {}
48
+ options.merge!(:allow_redirections => @allow_redirections) if @allow_redirections
49
+ options.merge!(@headers) if @headers.is_a?(Hash)
50
+
51
+ request = open(url, options)
47
52
 
48
53
  @url.url = request.base_uri.to_s
49
54
 
@@ -51,7 +56,7 @@ module MetaInspector
51
56
  end
52
57
 
53
58
  def defaults
54
- { allow_redirections: false, timeout: 20, exception_log: MetaInspector::ExceptionLog.new }
59
+ { timeout: 20, exception_log: MetaInspector::ExceptionLog.new }
55
60
  end
56
61
  end
57
62
  end
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "2.1.0"
4
+ VERSION = "2.2.0"
5
5
  end
@@ -89,4 +89,30 @@ describe MetaInspector::Document do
89
89
  tar_url.title
90
90
  end
91
91
  end
92
+
93
+ describe 'headers' do
94
+ it "should include default headers" do
95
+ url = 'http://example.com/headers'
96
+ request = double('Request', base_uri: url)
97
+ expected_headers = {'User-Agent' => "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"}
98
+
99
+ MetaInspector::Request.any_instance.should_receive(:open)
100
+ .with(url, expected_headers)
101
+ .and_return(request)
102
+
103
+ MetaInspector::Document.new(url)
104
+ end
105
+
106
+ it "should include passed headers on the request" do
107
+ url = 'http://example.com/headers'
108
+ headers = {'User-Agent' => 'Mozilla', 'Referer' => 'https://github.com/'}
109
+ request = double('Request', base_uri: url)
110
+
111
+ MetaInspector::Request.any_instance.should_receive(:open)
112
+ .with(url, headers)
113
+ .and_return(request)
114
+
115
+ MetaInspector::Document.new(url, headers: headers)
116
+ end
117
+ end
92
118
  end
metadata CHANGED
@@ -1,55 +1,55 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.0
4
+ version: 2.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaime Iniesta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-16 00:00:00.000000000 Z
11
+ date: 2014-04-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ~>
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.6'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.6'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: open_uri_redirections
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ~>
32
32
  - !ruby/object:Gem::Version
33
33
  version: 0.1.4
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ~>
39
39
  - !ruby/object:Gem::Version
40
40
  version: 0.1.4
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: addressable
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ~>
46
46
  - !ruby/object:Gem::Version
47
47
  version: 2.3.5
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ~>
53
53
  - !ruby/object:Gem::Version
54
54
  version: 2.3.5
55
55
  - !ruby/object:Gem::Dependency
@@ -84,28 +84,28 @@ dependencies:
84
84
  name: awesome_print
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - "~>"
87
+ - - ~>
88
88
  - !ruby/object:Gem::Version
89
89
  version: 1.2.0
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - "~>"
94
+ - - ~>
95
95
  - !ruby/object:Gem::Version
96
96
  version: 1.2.0
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: rake
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - "~>"
101
+ - - ~>
102
102
  - !ruby/object:Gem::Version
103
103
  version: 10.1.0
104
104
  type: :development
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
- - - "~>"
108
+ - - ~>
109
109
  - !ruby/object:Gem::Version
110
110
  version: 10.1.0
111
111
  description: MetaInspector lets you scrape a web page and get its title, charset,
@@ -116,15 +116,14 @@ executables: []
116
116
  extensions: []
117
117
  extra_rdoc_files: []
118
118
  files:
119
- - ".gitignore"
120
- - ".rspec.example"
121
- - ".travis.yml"
119
+ - .gitignore
120
+ - .rspec.example
121
+ - .travis.yml
122
122
  - Gemfile
123
123
  - MIT-LICENSE
124
124
  - README.md
125
125
  - Rakefile
126
126
  - lib/meta_inspector.rb
127
- - lib/meta_inspector/deprecations.rb
128
127
  - lib/meta_inspector/document.rb
129
128
  - lib/meta_inspector/exception_log.rb
130
129
  - lib/meta_inspector/exceptionable.rb
@@ -182,17 +181,17 @@ require_paths:
182
181
  - lib
183
182
  required_ruby_version: !ruby/object:Gem::Requirement
184
183
  requirements:
185
- - - ">="
184
+ - - '>='
186
185
  - !ruby/object:Gem::Version
187
186
  version: '0'
188
187
  required_rubygems_version: !ruby/object:Gem::Requirement
189
188
  requirements:
190
- - - ">="
189
+ - - '>='
191
190
  - !ruby/object:Gem::Version
192
191
  version: '0'
193
192
  requirements: []
194
193
  rubyforge_project:
195
- rubygems_version: 2.1.11
194
+ rubygems_version: 2.2.2
196
195
  signing_key:
197
196
  specification_version: 4
198
197
  summary: MetaInspector is a ruby gem for web scraping purposes, that returns a hash
@@ -1,19 +0,0 @@
1
- # -*- encoding: utf-8 -*-
2
-
3
- module MetaInspector
4
- class Scraper < Document
5
- def initialize
6
- warn "The Scraper class is now deprecated since version 1.17, use Document instead"
7
- super
8
- end
9
-
10
- def errors
11
- warn "The #errors method is deprecated since version 1.17, use #exceptions instead"
12
- exceptions
13
- end
14
-
15
- def document
16
- warn "The #document method is deprecated since version 1.17, use #to_s instead"
17
- end
18
- end
19
- end