metainspector 5.1.3 → 5.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/CHANGELOG.md +4 -0
- data/README.md +7 -8
- data/lib/meta_inspector/document.rb +15 -20
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -1
- data/spec/document_spec.rb +8 -17
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4d6981fee7a5b3c01045c6e152d20fb2c788c549
|
4
|
+
data.tar.gz: 4f0ff717747c9b1bdc5abae2eae1696bc4138033
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 782597ff87e03b177d73302efb5dcfa0713e7a27367fe93b43579960e89824caa8331a88c95d96410597d03ba0849fa4c4cf3fcf2caeb02d63bf2aa606aa4c08
|
7
|
+
data.tar.gz: 268e91699c34b9649e6441450ccb244fc1b49c6c06a3c1e6787f73fa04332b8172886228ce8a20ad714c727ee45cfa2665308aaa79b33eeef3ded7ac01225722
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,9 @@
|
|
1
1
|
# MetaInspector Changelog
|
2
2
|
|
3
|
+
## [Changes in 5.2](https://github.com/jaimeiniesta/metainspector/compare/v5.1.0...v5.2.0)
|
4
|
+
|
5
|
+
Removes the deprecated `html_content_only` option and replaces it with `allow_non_html_content`, which defaults to `false`.
|
6
|
+
|
3
7
|
## [Changes in 5.1](https://github.com/jaimeiniesta/metainspector/compare/v5.0.0...v5.1.0)
|
4
8
|
|
5
9
|
Deprecates the `html_content_only` option, and turns it on by default.
|
data/README.md
CHANGED
@@ -326,24 +326,22 @@ MetaInspector.new('https://example.com', faraday_options: { ssl: { verify: false
|
|
326
326
|
# Now we can access the page
|
327
327
|
```
|
328
328
|
|
329
|
-
### HTML
|
329
|
+
### Allow non-HTML content type
|
330
330
|
|
331
|
-
MetaInspector will
|
331
|
+
By default, MetaInspector raises an exception when trying to parse a non-HTML URL (one whose content type is anything other than text/html). You can disable this behaviour with:
|
332
332
|
|
333
333
|
```ruby
|
334
|
-
page = MetaInspector.new('sitevalidator.com', :
|
334
|
+
page = MetaInspector.new('sitevalidator.com', :allow_non_html_content => true)
|
335
335
|
```
|
336
336
|
|
337
|
-
This option is deprecated since 5.1.0 and will be removed in 5.2.0.
|
338
|
-
|
339
337
|
```ruby
|
340
338
|
page = MetaInspector.new('http://example.com/image.png')
|
341
339
|
page.content_type # "image/png"
|
342
|
-
page.description # will
|
340
|
+
page.description # will raise an exception
|
343
341
|
|
344
|
-
page = MetaInspector.new('http://example.com/image.png', :
|
342
|
+
page = MetaInspector.new('http://example.com/image.png', :allow_non_html_content => true)
|
345
343
|
page.content_type # "image/png"
|
346
|
-
page.description #
|
344
|
+
page.description # will return a garbled string
|
347
345
|
```
|
348
346
|
|
349
347
|
### URL Normalization
|
@@ -438,6 +436,7 @@ You can also come to chat with us on our [Gitter room](https://gitter.im/jaimein
|
|
438
436
|
|
439
437
|
* [go-metainspector](https://github.com/fern4lvarez/go-metainspector), a port of MetaInspector for Go.
|
440
438
|
* [Node-MetaInspector](https://github.com/gabceb/node-metainspector), a port of MetaInspector for Node.
|
439
|
+
* [MetaInvestigator](https://github.com/nekova/metainvestigator), a port of MetaInspector for Elixir.
|
441
440
|
|
442
441
|
## License
|
443
442
|
MetaInspector is released under the [MIT license](MIT-LICENSE).
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module MetaInspector
|
2
2
|
# A MetaInspector::Document knows about its URL and its contents
|
3
3
|
class Document
|
4
|
-
attr_reader :
|
4
|
+
attr_reader :allow_non_html_content, :allow_redirections, :headers
|
5
5
|
|
6
6
|
# Initializes a new instance of MetaInspector::Document, setting the URL
|
7
7
|
# Options:
|
@@ -14,19 +14,14 @@ module MetaInspector
|
|
14
14
|
# * normalize_url: true by default
|
15
15
|
# * faraday_options: an optional hash of options to pass to Faraday on the request
|
16
16
|
def initialize(initial_url, options = {})
|
17
|
-
unless options[:html_content_only].nil?
|
18
|
-
puts <<-EOS
|
19
|
-
DEPRECATION NOTICE: html_content_only is deprecated and turned on by default since 5.1.0,
|
20
|
-
this option will be removed in 5.2.0
|
21
|
-
EOS
|
22
|
-
end
|
23
17
|
options = defaults.merge(options)
|
24
18
|
@connection_timeout = options[:connection_timeout]
|
25
19
|
@read_timeout = options[:read_timeout]
|
26
20
|
@retries = options[:retries]
|
27
|
-
@html_content_only = options[:html_content_only]
|
28
21
|
|
29
|
-
@allow_redirections
|
22
|
+
@allow_redirections = options[:allow_redirections]
|
23
|
+
@allow_non_html_content = options[:allow_non_html_content]
|
24
|
+
|
30
25
|
@document = options[:document]
|
31
26
|
@download_images = options[:download_images]
|
32
27
|
@headers = options[:headers]
|
@@ -86,16 +81,16 @@ module MetaInspector
|
|
86
81
|
private
|
87
82
|
|
88
83
|
def defaults
|
89
|
-
{ :timeout
|
90
|
-
:retries
|
91
|
-
:
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
:
|
97
|
-
:normalize_url
|
98
|
-
:download_images
|
84
|
+
{ :timeout => 20,
|
85
|
+
:retries => 3,
|
86
|
+
:headers => {
|
87
|
+
'User-Agent' => default_user_agent,
|
88
|
+
'Accept-Encoding' => 'identity'
|
89
|
+
},
|
90
|
+
:allow_redirections => true,
|
91
|
+
:allow_non_html_content => false,
|
92
|
+
:normalize_url => true,
|
93
|
+
:download_images => true }
|
99
94
|
end
|
100
95
|
|
101
96
|
def default_user_agent
|
@@ -103,7 +98,7 @@ module MetaInspector
|
|
103
98
|
end
|
104
99
|
|
105
100
|
def document
|
106
|
-
@document ||= if
|
101
|
+
@document ||= if !allow_non_html_content && !content_type.nil? && content_type != 'text/html'
|
107
102
|
fail MetaInspector::ParserError.new "The url provided contains #{content_type} content instead of text/html content"
|
108
103
|
else
|
109
104
|
@request.read
|
data/meta_inspector.gemspec
CHANGED
@@ -21,7 +21,7 @@ Gem::Specification.new do |gem|
|
|
21
21
|
gem.add_dependency 'faraday-http-cache', '~> 1.2'
|
22
22
|
gem.add_dependency 'faraday-encoding', '~> 0.0.3'
|
23
23
|
gem.add_dependency 'addressable', '~> 2.4'
|
24
|
-
gem.add_dependency 'fastimage', '~>
|
24
|
+
gem.add_dependency 'fastimage', '~> 2.0'
|
25
25
|
gem.add_dependency 'nesty', '~> 1.0'
|
26
26
|
|
27
27
|
gem.add_development_dependency 'rspec', '~> 3.0'
|
data/spec/document_spec.rb
CHANGED
@@ -74,35 +74,26 @@ describe MetaInspector::Document do
|
|
74
74
|
})
|
75
75
|
end
|
76
76
|
|
77
|
-
describe
|
78
|
-
it "should not
|
77
|
+
describe "allow_non_html_content option" do
|
78
|
+
it "should not allow non-html content type by default" do
|
79
79
|
expect do
|
80
80
|
image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png')
|
81
81
|
image_url.title
|
82
82
|
end.to raise_error(MetaInspector::ParserError)
|
83
83
|
end
|
84
84
|
|
85
|
-
it "should
|
85
|
+
it "should not allow non-html content type when explicitly disallowed" do
|
86
86
|
expect do
|
87
|
-
image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png',
|
88
|
-
image_url.title
|
89
|
-
end.to_not raise_error
|
90
|
-
end
|
91
|
-
|
92
|
-
it "should handle errors when content is image/jpeg and html_content_type_only is true" do
|
93
|
-
expect do
|
94
|
-
image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png', html_content_only: true)
|
95
|
-
|
87
|
+
image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png', allow_non_html_content: false)
|
96
88
|
image_url.title
|
97
89
|
end.to raise_error(MetaInspector::ParserError)
|
98
90
|
end
|
99
91
|
|
100
|
-
it "should
|
92
|
+
it "should allow non-html content type when explicitly allowed" do
|
101
93
|
expect do
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
end.to raise_error(MetaInspector::ParserError)
|
94
|
+
image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png', allow_non_html_content: true)
|
95
|
+
image_url.title
|
96
|
+
end.to_not raise_error(MetaInspector::ParserError)
|
106
97
|
end
|
107
98
|
end
|
108
99
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 5.
|
4
|
+
version: 5.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-04-
|
11
|
+
date: 2016-04-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -114,14 +114,14 @@ dependencies:
|
|
114
114
|
requirements:
|
115
115
|
- - "~>"
|
116
116
|
- !ruby/object:Gem::Version
|
117
|
-
version:
|
117
|
+
version: '2.0'
|
118
118
|
type: :runtime
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
122
|
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
|
-
version:
|
124
|
+
version: '2.0'
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
126
|
name: nesty
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|