metainspector 5.1.3 → 5.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/CHANGELOG.md +4 -0
- data/README.md +7 -8
- data/lib/meta_inspector/document.rb +15 -20
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -1
- data/spec/document_spec.rb +8 -17
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4d6981fee7a5b3c01045c6e152d20fb2c788c549
|
4
|
+
data.tar.gz: 4f0ff717747c9b1bdc5abae2eae1696bc4138033
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 782597ff87e03b177d73302efb5dcfa0713e7a27367fe93b43579960e89824caa8331a88c95d96410597d03ba0849fa4c4cf3fcf2caeb02d63bf2aa606aa4c08
|
7
|
+
data.tar.gz: 268e91699c34b9649e6441450ccb244fc1b49c6c06a3c1e6787f73fa04332b8172886228ce8a20ad714c727ee45cfa2665308aaa79b33eeef3ded7ac01225722
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,9 @@
|
|
1
1
|
# MetaInspector Changelog
|
2
2
|
|
3
|
+
## [Changes in 5.2](https://github.com/jaimeiniesta/metainspector/compare/v5.1.0...v5.2.0)
|
4
|
+
|
5
|
+
Removes the deprecated `html_content_only` option and replaces it with `allow_non_html_content`, which defaults to `false`.
|
6
|
+
|
3
7
|
## [Changes in 5.1](https://github.com/jaimeiniesta/metainspector/compare/v5.0.0...v5.1.0)
|
4
8
|
|
5
9
|
Deprecates the `html_content_only` option, and turns it on by default.
|
data/README.md
CHANGED
@@ -326,24 +326,22 @@ MetaInspector.new('https://example.com', faraday_options: { ssl: { verify: false
|
|
326
326
|
# Now we can access the page
|
327
327
|
```
|
328
328
|
|
329
|
-
### HTML
|
329
|
+
### Allow non-HTML content type
|
330
330
|
|
331
|
-
MetaInspector will
|
331
|
+
By default, MetaInspector raises an exception when trying to parse a non-HTML URL (one whose content-type is not text/html). You can disable this behaviour with:
|
332
332
|
|
333
333
|
```ruby
|
334
|
-
page = MetaInspector.new('sitevalidator.com', :
|
334
|
+
page = MetaInspector.new('sitevalidator.com', :allow_non_html_content => true)
|
335
335
|
```
|
336
336
|
|
337
|
-
This option is deprecated since 5.1.0 and will be removed in 5.2.0.
|
338
|
-
|
339
337
|
```ruby
|
340
338
|
page = MetaInspector.new('http://example.com/image.png')
|
341
339
|
page.content_type # "image/png"
|
342
|
-
page.description # will
|
340
|
+
page.description # will raise an exception
|
343
341
|
|
344
|
-
page = MetaInspector.new('http://example.com/image.png', :
|
342
|
+
page = MetaInspector.new('http://example.com/image.png', :allow_non_html_content => true)
|
345
343
|
page.content_type # "image/png"
|
346
|
-
page.description #
|
344
|
+
page.description # will return a garbled string
|
347
345
|
```
|
348
346
|
|
349
347
|
### URL Normalization
|
@@ -438,6 +436,7 @@ You can also come to chat with us on our [Gitter room](https://gitter.im/jaimein
|
|
438
436
|
|
439
437
|
* [go-metainspector](https://github.com/fern4lvarez/go-metainspector), a port of MetaInspector for Go.
|
440
438
|
* [Node-MetaInspector](https://github.com/gabceb/node-metainspector), a port of MetaInspector for Node.
|
439
|
+
* [MetaInvestigator](https://github.com/nekova/metainvestigator), a port of MetaInspector for Elixir.
|
441
440
|
|
442
441
|
## License
|
443
442
|
MetaInspector is released under the [MIT license](MIT-LICENSE).
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module MetaInspector
|
2
2
|
# A MetaInspector::Document knows about its URL and its contents
|
3
3
|
class Document
|
4
|
-
attr_reader :
|
4
|
+
attr_reader :allow_non_html_content, :allow_redirections, :headers
|
5
5
|
|
6
6
|
# Initializes a new instance of MetaInspector::Document, setting the URL
|
7
7
|
# Options:
|
@@ -14,19 +14,14 @@ module MetaInspector
|
|
14
14
|
# * normalize_url: true by default
|
15
15
|
# * faraday_options: an optional hash of options to pass to Faraday on the request
|
16
16
|
def initialize(initial_url, options = {})
|
17
|
-
unless options[:html_content_only].nil?
|
18
|
-
puts <<-EOS
|
19
|
-
DEPRECATION NOTICE: html_content_only is deprecated and turned on by default since 5.1.0,
|
20
|
-
this option will be removed in 5.2.0
|
21
|
-
EOS
|
22
|
-
end
|
23
17
|
options = defaults.merge(options)
|
24
18
|
@connection_timeout = options[:connection_timeout]
|
25
19
|
@read_timeout = options[:read_timeout]
|
26
20
|
@retries = options[:retries]
|
27
|
-
@html_content_only = options[:html_content_only]
|
28
21
|
|
29
|
-
@allow_redirections
|
22
|
+
@allow_redirections = options[:allow_redirections]
|
23
|
+
@allow_non_html_content = options[:allow_non_html_content]
|
24
|
+
|
30
25
|
@document = options[:document]
|
31
26
|
@download_images = options[:download_images]
|
32
27
|
@headers = options[:headers]
|
@@ -86,16 +81,16 @@ module MetaInspector
|
|
86
81
|
private
|
87
82
|
|
88
83
|
def defaults
|
89
|
-
{ :timeout
|
90
|
-
:retries
|
91
|
-
:
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
:
|
97
|
-
:normalize_url
|
98
|
-
:download_images
|
84
|
+
{ :timeout => 20,
|
85
|
+
:retries => 3,
|
86
|
+
:headers => {
|
87
|
+
'User-Agent' => default_user_agent,
|
88
|
+
'Accept-Encoding' => 'identity'
|
89
|
+
},
|
90
|
+
:allow_redirections => true,
|
91
|
+
:allow_non_html_content => false,
|
92
|
+
:normalize_url => true,
|
93
|
+
:download_images => true }
|
99
94
|
end
|
100
95
|
|
101
96
|
def default_user_agent
|
@@ -103,7 +98,7 @@ module MetaInspector
|
|
103
98
|
end
|
104
99
|
|
105
100
|
def document
|
106
|
-
@document ||= if
|
101
|
+
@document ||= if !allow_non_html_content && !content_type.nil? && content_type != 'text/html'
|
107
102
|
fail MetaInspector::ParserError.new "The url provided contains #{content_type} content instead of text/html content"
|
108
103
|
else
|
109
104
|
@request.read
|
data/meta_inspector.gemspec
CHANGED
@@ -21,7 +21,7 @@ Gem::Specification.new do |gem|
|
|
21
21
|
gem.add_dependency 'faraday-http-cache', '~> 1.2'
|
22
22
|
gem.add_dependency 'faraday-encoding', '~> 0.0.3'
|
23
23
|
gem.add_dependency 'addressable', '~> 2.4'
|
24
|
-
gem.add_dependency 'fastimage', '~>
|
24
|
+
gem.add_dependency 'fastimage', '~> 2.0'
|
25
25
|
gem.add_dependency 'nesty', '~> 1.0'
|
26
26
|
|
27
27
|
gem.add_development_dependency 'rspec', '~> 3.0'
|
data/spec/document_spec.rb
CHANGED
@@ -74,35 +74,26 @@ describe MetaInspector::Document do
|
|
74
74
|
})
|
75
75
|
end
|
76
76
|
|
77
|
-
describe
|
78
|
-
it "should not
|
77
|
+
describe "allow_non_html_content option" do
|
78
|
+
it "should not allow non-html content type by default" do
|
79
79
|
expect do
|
80
80
|
image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png')
|
81
81
|
image_url.title
|
82
82
|
end.to raise_error(MetaInspector::ParserError)
|
83
83
|
end
|
84
84
|
|
85
|
-
it "should
|
85
|
+
it "should not allow non-html content type when explicitly disallowed" do
|
86
86
|
expect do
|
87
|
-
image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png',
|
88
|
-
image_url.title
|
89
|
-
end.to_not raise_error
|
90
|
-
end
|
91
|
-
|
92
|
-
it "should handle errors when content is image/jpeg and html_content_type_only is true" do
|
93
|
-
expect do
|
94
|
-
image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png', html_content_only: true)
|
95
|
-
|
87
|
+
image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png', allow_non_html_content: false)
|
96
88
|
image_url.title
|
97
89
|
end.to raise_error(MetaInspector::ParserError)
|
98
90
|
end
|
99
91
|
|
100
|
-
it "should
|
92
|
+
it "should allow non-html content type when explicitly allowed" do
|
101
93
|
expect do
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
end.to raise_error(MetaInspector::ParserError)
|
94
|
+
image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png', allow_non_html_content: true)
|
95
|
+
image_url.title
|
96
|
+
end.to_not raise_error(MetaInspector::ParserError)
|
106
97
|
end
|
107
98
|
end
|
108
99
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 5.
|
4
|
+
version: 5.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-04-
|
11
|
+
date: 2016-04-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -114,14 +114,14 @@ dependencies:
|
|
114
114
|
requirements:
|
115
115
|
- - "~>"
|
116
116
|
- !ruby/object:Gem::Version
|
117
|
-
version:
|
117
|
+
version: '2.0'
|
118
118
|
type: :runtime
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
122
|
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
|
-
version:
|
124
|
+
version: '2.0'
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
126
|
name: nesty
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|