metainspector 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/README.md +12 -1
- data/lib/meta_inspector.rb +0 -1
- data/lib/meta_inspector/document.rb +16 -15
- data/lib/meta_inspector/request.rb +7 -2
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/document_spec.rb +26 -0
- metadata +18 -19
- data/lib/meta_inspector/deprecations.rb +0 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9c230bbbcf325166d2e5906ed7f13af8e0f8ef06
|
4
|
+
data.tar.gz: 245e219fc420a20e8fd8eef722502a6cc23d8087
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 791526a75ce5e23651a7cbded8ee4e9ab12fd75281d7089a2f2191ee44293c9f1b8d8587cae3284e2ed1c103d9020cfa2038fd1be4ddc90d2e78d1235fcc0374
|
7
|
+
data.tar.gz: f3c535c4a4b7618168e30cf2427a22365404dc00f698dde32669364a86770bc8445b20989974298afd124822fddea272afd000035663d7e0eb6cd5acb0676006
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -191,6 +191,17 @@ However, you can tell MetaInspector to allow these redirections with the option
|
|
191
191
|
# And this will allow HTTP => HTTPS ("safe") and HTTPS => HTTP ("unsafe") redirections
|
192
192
|
page = MetaInspector.new('facebook.com', :allow_redirections => :all)
|
193
193
|
|
194
|
+
### Headers
|
195
|
+
|
196
|
+
By default, the following headers are set:
|
197
|
+
|
198
|
+
{'User-Agent' => "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"}
|
199
|
+
|
200
|
+
If you want to set custom headers then use the `headers` option:
|
201
|
+
|
202
|
+
# Set the User-Agent header
|
203
|
+
page = MetaInspector.new('example.com', :headers => {'User-Agent' => 'My custom User-Agent'})
|
204
|
+
|
194
205
|
### HTML Content Only
|
195
206
|
|
196
207
|
MetaInspector will try to parse all URLs by default. If you want to raise an exception when trying to parse a non-html URL (one that has a content-type different than text/html), you can state it like this:
|
@@ -215,7 +226,7 @@ However, if you prefer you can also set the `warn_level: :warn` option, so that
|
|
215
226
|
|
216
227
|
You can also set the `warn_level: :store` option so that exceptions found will be silenced, and left for you to inspect on `page.exceptions`. You can also ask for `page.ok?`, which will return `true` if no exceptions are stored.
|
217
228
|
|
218
|
-
You should avoid using the `:store` option, or use it wisely, as silencing errors can be problematic, it's always better to face the errors and treat them accordingly.
|
229
|
+
You should avoid using the `:store` option, or use it wisely, as silencing errors can be problematic, it's always better to face the errors and treat them accordingly.
|
219
230
|
|
220
231
|
## Examples
|
221
232
|
|
data/lib/meta_inspector.rb
CHANGED
@@ -7,7 +7,6 @@ require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/reque
|
|
7
7
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/url'))
|
8
8
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parser'))
|
9
9
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/document'))
|
10
|
-
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/deprecations'))
|
11
10
|
|
12
11
|
module MetaInspector
|
13
12
|
extend self
|
@@ -3,7 +3,7 @@
|
|
3
3
|
module MetaInspector
|
4
4
|
# A MetaInspector::Document knows about its URL and its contents
|
5
5
|
class Document
|
6
|
-
attr_reader :timeout, :html_content_only, :allow_redirections, :warn_level
|
6
|
+
attr_reader :timeout, :html_content_only, :allow_redirections, :warn_level, :headers
|
7
7
|
|
8
8
|
include MetaInspector::Exceptionable
|
9
9
|
|
@@ -14,25 +14,22 @@ module MetaInspector
|
|
14
14
|
# => allow_redirections: when :safe, allows HTTP => HTTPS redirections. When :all, it also allows HTTPS => HTTP
|
15
15
|
# => document: the html of the url as a string
|
16
16
|
# => warn_level: what to do when encountering exceptions. Can be :warn, :raise or nil
|
17
|
+
# => headers: object containing custom headers for the request
|
17
18
|
def initialize(initial_url, options = {})
|
18
19
|
options = defaults.merge(options)
|
19
20
|
@timeout = options[:timeout]
|
20
21
|
@html_content_only = options[:html_content_only]
|
21
22
|
@allow_redirections = options[:allow_redirections]
|
22
23
|
@document = options[:document]
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
@
|
30
|
-
|
31
|
-
@
|
32
|
-
@request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
|
33
|
-
timeout: @timeout,
|
34
|
-
exception_log: @exception_log) unless @document
|
35
|
-
@parser = MetaInspector::Parser.new(self, exception_log: @exception_log)
|
24
|
+
@headers = options[:headers]
|
25
|
+
@warn_level = options[:warn_level]
|
26
|
+
@exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level)
|
27
|
+
@url = MetaInspector::URL.new(initial_url, exception_log: @exception_log)
|
28
|
+
@request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
|
29
|
+
timeout: @timeout,
|
30
|
+
exception_log: @exception_log,
|
31
|
+
headers: @headers) unless @document
|
32
|
+
@parser = MetaInspector::Parser.new(self, exception_log: @exception_log)
|
36
33
|
end
|
37
34
|
|
38
35
|
extend Forwardable
|
@@ -65,7 +62,11 @@ module MetaInspector
|
|
65
62
|
private
|
66
63
|
|
67
64
|
def defaults
|
68
|
-
{ :timeout => 20,
|
65
|
+
{ :timeout => 20,
|
66
|
+
:html_content_only => false,
|
67
|
+
:warn_level => :raise,
|
68
|
+
:headers => {'User-Agent' => "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"}
|
69
|
+
}
|
69
70
|
end
|
70
71
|
|
71
72
|
def document
|
@@ -17,6 +17,7 @@ module MetaInspector
|
|
17
17
|
@allow_redirections = options[:allow_redirections]
|
18
18
|
@timeout = options[:timeout]
|
19
19
|
@exception_log = options[:exception_log]
|
20
|
+
@headers = options[:headers]
|
20
21
|
|
21
22
|
response # as soon as it is set up, we make the request so we can fail early
|
22
23
|
end
|
@@ -43,7 +44,11 @@ module MetaInspector
|
|
43
44
|
end
|
44
45
|
|
45
46
|
def fetch
|
46
|
-
|
47
|
+
options = {}
|
48
|
+
options.merge!(:allow_redirections => @allow_redirections) if @allow_redirections
|
49
|
+
options.merge!(@headers) if @headers.is_a?(Hash)
|
50
|
+
|
51
|
+
request = open(url, options)
|
47
52
|
|
48
53
|
@url.url = request.base_uri.to_s
|
49
54
|
|
@@ -51,7 +56,7 @@ module MetaInspector
|
|
51
56
|
end
|
52
57
|
|
53
58
|
def defaults
|
54
|
-
{
|
59
|
+
{ timeout: 20, exception_log: MetaInspector::ExceptionLog.new }
|
55
60
|
end
|
56
61
|
end
|
57
62
|
end
|
data/spec/document_spec.rb
CHANGED
@@ -89,4 +89,30 @@ describe MetaInspector::Document do
|
|
89
89
|
tar_url.title
|
90
90
|
end
|
91
91
|
end
|
92
|
+
|
93
|
+
describe 'headers' do
|
94
|
+
it "should include default headers" do
|
95
|
+
url = 'http://example.com/headers'
|
96
|
+
request = double('Request', base_uri: url)
|
97
|
+
expected_headers = {'User-Agent' => "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"}
|
98
|
+
|
99
|
+
MetaInspector::Request.any_instance.should_receive(:open)
|
100
|
+
.with(url, expected_headers)
|
101
|
+
.and_return(request)
|
102
|
+
|
103
|
+
MetaInspector::Document.new(url)
|
104
|
+
end
|
105
|
+
|
106
|
+
it "should include passed headers on the request" do
|
107
|
+
url = 'http://example.com/headers'
|
108
|
+
headers = {'User-Agent' => 'Mozilla', 'Referer' => 'https://github.com/'}
|
109
|
+
request = double('Request', base_uri: url)
|
110
|
+
|
111
|
+
MetaInspector::Request.any_instance.should_receive(:open)
|
112
|
+
.with(url, headers)
|
113
|
+
.and_return(request)
|
114
|
+
|
115
|
+
MetaInspector::Document.new(url, headers: headers)
|
116
|
+
end
|
117
|
+
end
|
92
118
|
end
|
metadata
CHANGED
@@ -1,55 +1,55 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-04-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ~>
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.6'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - ~>
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.6'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: open_uri_redirections
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - ~>
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 0.1.4
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - ~>
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 0.1.4
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: addressable
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - ~>
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: 2.3.5
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - ~>
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: 2.3.5
|
55
55
|
- !ruby/object:Gem::Dependency
|
@@ -84,28 +84,28 @@ dependencies:
|
|
84
84
|
name: awesome_print
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- -
|
87
|
+
- - ~>
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: 1.2.0
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- -
|
94
|
+
- - ~>
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: 1.2.0
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: rake
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- -
|
101
|
+
- - ~>
|
102
102
|
- !ruby/object:Gem::Version
|
103
103
|
version: 10.1.0
|
104
104
|
type: :development
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
|
-
- -
|
108
|
+
- - ~>
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: 10.1.0
|
111
111
|
description: MetaInspector lets you scrape a web page and get its title, charset,
|
@@ -116,15 +116,14 @@ executables: []
|
|
116
116
|
extensions: []
|
117
117
|
extra_rdoc_files: []
|
118
118
|
files:
|
119
|
-
-
|
120
|
-
-
|
121
|
-
-
|
119
|
+
- .gitignore
|
120
|
+
- .rspec.example
|
121
|
+
- .travis.yml
|
122
122
|
- Gemfile
|
123
123
|
- MIT-LICENSE
|
124
124
|
- README.md
|
125
125
|
- Rakefile
|
126
126
|
- lib/meta_inspector.rb
|
127
|
-
- lib/meta_inspector/deprecations.rb
|
128
127
|
- lib/meta_inspector/document.rb
|
129
128
|
- lib/meta_inspector/exception_log.rb
|
130
129
|
- lib/meta_inspector/exceptionable.rb
|
@@ -182,17 +181,17 @@ require_paths:
|
|
182
181
|
- lib
|
183
182
|
required_ruby_version: !ruby/object:Gem::Requirement
|
184
183
|
requirements:
|
185
|
-
- -
|
184
|
+
- - '>='
|
186
185
|
- !ruby/object:Gem::Version
|
187
186
|
version: '0'
|
188
187
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
189
188
|
requirements:
|
190
|
-
- -
|
189
|
+
- - '>='
|
191
190
|
- !ruby/object:Gem::Version
|
192
191
|
version: '0'
|
193
192
|
requirements: []
|
194
193
|
rubyforge_project:
|
195
|
-
rubygems_version: 2.
|
194
|
+
rubygems_version: 2.2.2
|
196
195
|
signing_key:
|
197
196
|
specification_version: 4
|
198
197
|
summary: MetaInspector is a ruby gem for web scraping purposes, that returns a hash
|
@@ -1,19 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
module MetaInspector
|
4
|
-
class Scraper < Document
|
5
|
-
def initialize
|
6
|
-
warn "The Scraper class is now deprecated since version 1.17, use Document instead"
|
7
|
-
super
|
8
|
-
end
|
9
|
-
|
10
|
-
def errors
|
11
|
-
warn "The #errors method is deprecated since version 1.17, use #exceptions instead"
|
12
|
-
exceptions
|
13
|
-
end
|
14
|
-
|
15
|
-
def document
|
16
|
-
warn "The #document method is deprecated since version 1.17, use #to_s instead"
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|