metainspector 2.1.0 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/README.md +12 -1
- data/lib/meta_inspector.rb +0 -1
- data/lib/meta_inspector/document.rb +16 -15
- data/lib/meta_inspector/request.rb +7 -2
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/document_spec.rb +26 -0
- metadata +18 -19
- data/lib/meta_inspector/deprecations.rb +0 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9c230bbbcf325166d2e5906ed7f13af8e0f8ef06
|
4
|
+
data.tar.gz: 245e219fc420a20e8fd8eef722502a6cc23d8087
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 791526a75ce5e23651a7cbded8ee4e9ab12fd75281d7089a2f2191ee44293c9f1b8d8587cae3284e2ed1c103d9020cfa2038fd1be4ddc90d2e78d1235fcc0374
|
7
|
+
data.tar.gz: f3c535c4a4b7618168e30cf2427a22365404dc00f698dde32669364a86770bc8445b20989974298afd124822fddea272afd000035663d7e0eb6cd5acb0676006
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -191,6 +191,17 @@ However, you can tell MetaInspector to allow these redirections with the option
|
|
191
191
|
# And this will allow HTTP => HTTPS ("safe") and HTTPS => HTTP ("unsafe") redirections
|
192
192
|
page = MetaInspector.new('facebook.com', :allow_redirections => :all)
|
193
193
|
|
194
|
+
### Headers
|
195
|
+
|
196
|
+
By default, the following headers are set:
|
197
|
+
|
198
|
+
{'User-Agent' => "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"}
|
199
|
+
|
200
|
+
If you want to set custom headers then use the `headers` option:
|
201
|
+
|
202
|
+
# Set the User-Agent header
|
203
|
+
page = MetaInspector.new('example.com', :headers => {'User-Agent' => 'My custom User-Agent'})
|
204
|
+
|
194
205
|
### HTML Content Only
|
195
206
|
|
196
207
|
MetaInspector will try to parse all URLs by default. If you want to raise an exception when trying to parse a non-html URL (one that has a content-type different than text/html), you can state it like this:
|
@@ -215,7 +226,7 @@ However, if you prefer you can also set the `warn_level: :warn` option, so that
|
|
215
226
|
|
216
227
|
You can also set the `warn_level: :store` option so that exceptions found will be silenced, and left for you to inspect on `page.exceptions`. You can also ask for `page.ok?`, which will return `true` if no exceptions are stored.
|
217
228
|
|
218
|
-
You should avoid using the `:store` option, or use it wisely, as silencing errors can be problematic, it's always better to face the errors and treat them accordingly.
|
229
|
+
You should avoid using the `:store` option, or use it wisely, as silencing errors can be problematic, it's always better to face the errors and treat them accordingly.
|
219
230
|
|
220
231
|
## Examples
|
221
232
|
|
data/lib/meta_inspector.rb
CHANGED
@@ -7,7 +7,6 @@ require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/reque
|
|
7
7
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/url'))
|
8
8
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parser'))
|
9
9
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/document'))
|
10
|
-
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/deprecations'))
|
11
10
|
|
12
11
|
module MetaInspector
|
13
12
|
extend self
|
@@ -3,7 +3,7 @@
|
|
3
3
|
module MetaInspector
|
4
4
|
# A MetaInspector::Document knows about its URL and its contents
|
5
5
|
class Document
|
6
|
-
attr_reader :timeout, :html_content_only, :allow_redirections, :warn_level
|
6
|
+
attr_reader :timeout, :html_content_only, :allow_redirections, :warn_level, :headers
|
7
7
|
|
8
8
|
include MetaInspector::Exceptionable
|
9
9
|
|
@@ -14,25 +14,22 @@ module MetaInspector
|
|
14
14
|
# => allow_redirections: when :safe, allows HTTP => HTTPS redirections. When :all, it also allows HTTPS => HTTP
|
15
15
|
# => document: the html of the url as a string
|
16
16
|
# => warn_level: what to do when encountering exceptions. Can be :warn, :raise or nil
|
17
|
+
# => headers: object containing custom headers for the request
|
17
18
|
def initialize(initial_url, options = {})
|
18
19
|
options = defaults.merge(options)
|
19
20
|
@timeout = options[:timeout]
|
20
21
|
@html_content_only = options[:html_content_only]
|
21
22
|
@allow_redirections = options[:allow_redirections]
|
22
23
|
@document = options[:document]
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
@
|
30
|
-
|
31
|
-
@
|
32
|
-
@request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
|
33
|
-
timeout: @timeout,
|
34
|
-
exception_log: @exception_log) unless @document
|
35
|
-
@parser = MetaInspector::Parser.new(self, exception_log: @exception_log)
|
24
|
+
@headers = options[:headers]
|
25
|
+
@warn_level = options[:warn_level]
|
26
|
+
@exception_log = options[:exception_log] || MetaInspector::ExceptionLog.new(warn_level: warn_level)
|
27
|
+
@url = MetaInspector::URL.new(initial_url, exception_log: @exception_log)
|
28
|
+
@request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
|
29
|
+
timeout: @timeout,
|
30
|
+
exception_log: @exception_log,
|
31
|
+
headers: @headers) unless @document
|
32
|
+
@parser = MetaInspector::Parser.new(self, exception_log: @exception_log)
|
36
33
|
end
|
37
34
|
|
38
35
|
extend Forwardable
|
@@ -65,7 +62,11 @@ module MetaInspector
|
|
65
62
|
private
|
66
63
|
|
67
64
|
def defaults
|
68
|
-
{ :timeout => 20,
|
65
|
+
{ :timeout => 20,
|
66
|
+
:html_content_only => false,
|
67
|
+
:warn_level => :raise,
|
68
|
+
:headers => {'User-Agent' => "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"}
|
69
|
+
}
|
69
70
|
end
|
70
71
|
|
71
72
|
def document
|
@@ -17,6 +17,7 @@ module MetaInspector
|
|
17
17
|
@allow_redirections = options[:allow_redirections]
|
18
18
|
@timeout = options[:timeout]
|
19
19
|
@exception_log = options[:exception_log]
|
20
|
+
@headers = options[:headers]
|
20
21
|
|
21
22
|
response # as soon as it is set up, we make the request so we can fail early
|
22
23
|
end
|
@@ -43,7 +44,11 @@ module MetaInspector
|
|
43
44
|
end
|
44
45
|
|
45
46
|
def fetch
|
46
|
-
|
47
|
+
options = {}
|
48
|
+
options.merge!(:allow_redirections => @allow_redirections) if @allow_redirections
|
49
|
+
options.merge!(@headers) if @headers.is_a?(Hash)
|
50
|
+
|
51
|
+
request = open(url, options)
|
47
52
|
|
48
53
|
@url.url = request.base_uri.to_s
|
49
54
|
|
@@ -51,7 +56,7 @@ module MetaInspector
|
|
51
56
|
end
|
52
57
|
|
53
58
|
def defaults
|
54
|
-
{
|
59
|
+
{ timeout: 20, exception_log: MetaInspector::ExceptionLog.new }
|
55
60
|
end
|
56
61
|
end
|
57
62
|
end
|
data/spec/document_spec.rb
CHANGED
@@ -89,4 +89,30 @@ describe MetaInspector::Document do
|
|
89
89
|
tar_url.title
|
90
90
|
end
|
91
91
|
end
|
92
|
+
|
93
|
+
describe 'headers' do
|
94
|
+
it "should include default headers" do
|
95
|
+
url = 'http://example.com/headers'
|
96
|
+
request = double('Request', base_uri: url)
|
97
|
+
expected_headers = {'User-Agent' => "MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"}
|
98
|
+
|
99
|
+
MetaInspector::Request.any_instance.should_receive(:open)
|
100
|
+
.with(url, expected_headers)
|
101
|
+
.and_return(request)
|
102
|
+
|
103
|
+
MetaInspector::Document.new(url)
|
104
|
+
end
|
105
|
+
|
106
|
+
it "should include passed headers on the request" do
|
107
|
+
url = 'http://example.com/headers'
|
108
|
+
headers = {'User-Agent' => 'Mozilla', 'Referer' => 'https://github.com/'}
|
109
|
+
request = double('Request', base_uri: url)
|
110
|
+
|
111
|
+
MetaInspector::Request.any_instance.should_receive(:open)
|
112
|
+
.with(url, headers)
|
113
|
+
.and_return(request)
|
114
|
+
|
115
|
+
MetaInspector::Document.new(url, headers: headers)
|
116
|
+
end
|
117
|
+
end
|
92
118
|
end
|
metadata
CHANGED
@@ -1,55 +1,55 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.0
|
4
|
+
version: 2.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-04-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ~>
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.6'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - ~>
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.6'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: open_uri_redirections
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - ~>
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 0.1.4
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - ~>
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 0.1.4
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: addressable
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - ~>
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: 2.3.5
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - ~>
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: 2.3.5
|
55
55
|
- !ruby/object:Gem::Dependency
|
@@ -84,28 +84,28 @@ dependencies:
|
|
84
84
|
name: awesome_print
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- -
|
87
|
+
- - ~>
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: 1.2.0
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- -
|
94
|
+
- - ~>
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: 1.2.0
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: rake
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- -
|
101
|
+
- - ~>
|
102
102
|
- !ruby/object:Gem::Version
|
103
103
|
version: 10.1.0
|
104
104
|
type: :development
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
|
-
- -
|
108
|
+
- - ~>
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: 10.1.0
|
111
111
|
description: MetaInspector lets you scrape a web page and get its title, charset,
|
@@ -116,15 +116,14 @@ executables: []
|
|
116
116
|
extensions: []
|
117
117
|
extra_rdoc_files: []
|
118
118
|
files:
|
119
|
-
-
|
120
|
-
-
|
121
|
-
-
|
119
|
+
- .gitignore
|
120
|
+
- .rspec.example
|
121
|
+
- .travis.yml
|
122
122
|
- Gemfile
|
123
123
|
- MIT-LICENSE
|
124
124
|
- README.md
|
125
125
|
- Rakefile
|
126
126
|
- lib/meta_inspector.rb
|
127
|
-
- lib/meta_inspector/deprecations.rb
|
128
127
|
- lib/meta_inspector/document.rb
|
129
128
|
- lib/meta_inspector/exception_log.rb
|
130
129
|
- lib/meta_inspector/exceptionable.rb
|
@@ -182,17 +181,17 @@ require_paths:
|
|
182
181
|
- lib
|
183
182
|
required_ruby_version: !ruby/object:Gem::Requirement
|
184
183
|
requirements:
|
185
|
-
- -
|
184
|
+
- - '>='
|
186
185
|
- !ruby/object:Gem::Version
|
187
186
|
version: '0'
|
188
187
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
189
188
|
requirements:
|
190
|
-
- -
|
189
|
+
- - '>='
|
191
190
|
- !ruby/object:Gem::Version
|
192
191
|
version: '0'
|
193
192
|
requirements: []
|
194
193
|
rubyforge_project:
|
195
|
-
rubygems_version: 2.
|
194
|
+
rubygems_version: 2.2.2
|
196
195
|
signing_key:
|
197
196
|
specification_version: 4
|
198
197
|
summary: MetaInspector is a ruby gem for web scraping purposes, that returns a hash
|
@@ -1,19 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
|
-
module MetaInspector
|
4
|
-
class Scraper < Document
|
5
|
-
def initialize
|
6
|
-
warn "The Scraper class is now deprecated since version 1.17, use Document instead"
|
7
|
-
super
|
8
|
-
end
|
9
|
-
|
10
|
-
def errors
|
11
|
-
warn "The #errors method is deprecated since version 1.17, use #exceptions instead"
|
12
|
-
exceptions
|
13
|
-
end
|
14
|
-
|
15
|
-
def document
|
16
|
-
warn "The #document method is deprecated since version 1.17, use #to_s instead"
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|