metainspector 5.0.2 → 5.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -3
- data/lib/meta_inspector/document.rb +16 -9
- data/lib/meta_inspector/request.rb +2 -0
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -0
- data/spec/document_spec.rb +12 -2
- data/spec/fixtures/encoding.response +27 -0
- data/spec/spec_helper.rb +3 -0
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0f69de2705313a7048cf7d2067238a38f7e42bfc
|
4
|
+
data.tar.gz: 9b4770b9f1e769b5da1abab6814bc7dc4bb38fa6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2f35f03456f48aa4cfd65de50349ac7f67d08ad699e1f72c29251e95468f52457c137beeddcd4cf2afac261a9a6dfdb4d9462b831a907fd75d01bfa6168231e5
|
7
|
+
data.tar.gz: afc08f9b3217b7c6ee0fec83d1fef1f706af14f685a44d081724341e93ef352d2a69645436b375d9c0e08f49a576ab08669539f75b826b51bfbb3203e4f0e6a5
|
data/README.md
CHANGED
@@ -328,13 +328,13 @@ MetaInspector.new('https://example.com', faraday_options: { ssl: { verify: false
|
|
328
328
|
|
329
329
|
### HTML Content Only
|
330
330
|
|
331
|
-
MetaInspector will try to parse all URLs by default.
|
331
|
+
MetaInspector will try to parse every URL it is given. By default, it raises an exception when the URL is non-HTML (that is, when its content-type is anything other than text/html). You can disable this behaviour with:
|
332
332
|
|
333
333
|
```ruby
|
334
|
-
page = MetaInspector.new('sitevalidator.com', :html_content_only =>
|
334
|
+
page = MetaInspector.new('sitevalidator.com', :html_content_only => false)
|
335
335
|
```
|
336
336
|
|
337
|
-
This is
|
337
|
+
This option has been deprecated since 5.1.0 and will be removed in 5.2.0.
|
338
338
|
|
339
339
|
```ruby
|
340
340
|
page = MetaInspector.new('http://example.com/image.png')
|
@@ -8,8 +8,6 @@ module MetaInspector
|
|
8
8
|
# * connection_timeout: defaults to 20 seconds
|
9
9
|
# * read_timeout: defaults to 20 seconds
|
10
10
|
# * retries: defaults to 3 times
|
11
|
-
# * html_content_type_only: if an exception should be raised if request
|
12
|
-
# content-type is not text/html. Defaults to false.
|
13
11
|
# * allow_redirections: when true, follow HTTP redirects. Defaults to true
|
14
12
|
# * document: the html of the url as a string
|
15
13
|
# * headers: object containing custom headers for the request
|
@@ -20,7 +18,16 @@ module MetaInspector
|
|
20
18
|
@connection_timeout = options[:connection_timeout]
|
21
19
|
@read_timeout = options[:read_timeout]
|
22
20
|
@retries = options[:retries]
|
23
|
-
|
21
|
+
|
22
|
+
unless options[:html_content_only].nil?
|
23
|
+
@html_content_only = options[:html_content_only]
|
24
|
+
|
25
|
+
puts <<-EOS
|
26
|
+
DEPRECATION NOTICE: html_content_only is deprecated and turned on by default since 5.1.0,
|
27
|
+
this option will be removed in 5.2.0
|
28
|
+
EOS
|
29
|
+
end
|
30
|
+
|
24
31
|
@allow_redirections = options[:allow_redirections]
|
25
32
|
@document = options[:document]
|
26
33
|
@download_images = options[:download_images]
|
@@ -83,7 +90,7 @@ module MetaInspector
|
|
83
90
|
def defaults
|
84
91
|
{ :timeout => 20,
|
85
92
|
:retries => 3,
|
86
|
-
:html_content_only =>
|
93
|
+
:html_content_only => true,
|
87
94
|
:headers => {
|
88
95
|
'User-Agent' => default_user_agent,
|
89
96
|
'Accept-Encoding' => 'identity'
|
@@ -98,11 +105,11 @@ module MetaInspector
|
|
98
105
|
end
|
99
106
|
|
100
107
|
def document
|
101
|
-
@document ||= if html_content_only && content_type != 'text/html'
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
108
|
+
@document ||= if html_content_only && !content_type.nil? && content_type != 'text/html'
|
109
|
+
fail MetaInspector::ParserError.new "The url provided contains #{content_type} content instead of text/html content"
|
110
|
+
else
|
111
|
+
@request.read
|
112
|
+
end
|
106
113
|
end
|
107
114
|
end
|
108
115
|
end
|
@@ -2,6 +2,7 @@ require 'faraday'
|
|
2
2
|
require 'faraday_middleware'
|
3
3
|
require 'faraday-cookie_jar'
|
4
4
|
require 'faraday-http-cache'
|
5
|
+
require 'faraday/encoding'
|
5
6
|
|
6
7
|
module MetaInspector
|
7
8
|
|
@@ -62,6 +63,7 @@ module MetaInspector
|
|
62
63
|
end
|
63
64
|
|
64
65
|
faraday.headers.merge!(@headers || {})
|
66
|
+
faraday.response :encoding
|
65
67
|
faraday.adapter :net_http
|
66
68
|
end
|
67
69
|
|
data/meta_inspector.gemspec
CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |gem|
|
|
19
19
|
gem.add_dependency 'faraday_middleware', '~> 0.10'
|
20
20
|
gem.add_dependency 'faraday-cookie_jar', '~> 0.0'
|
21
21
|
gem.add_dependency 'faraday-http-cache', '~> 1.2'
|
22
|
+
gem.add_dependency 'faraday-encoding', '~> 0.0.2'
|
22
23
|
gem.add_dependency 'addressable', '~> 2.4'
|
23
24
|
gem.add_dependency 'fastimage', '~> 1.8.1'
|
24
25
|
gem.add_dependency 'nesty', '~> 1.0'
|
data/spec/document_spec.rb
CHANGED
@@ -75,11 +75,11 @@ describe MetaInspector::Document do
|
|
75
75
|
end
|
76
76
|
|
77
77
|
describe 'exception handling' do
|
78
|
-
it "should parse images when parse_html_content_type_only is not specified" do
|
78
|
+
it "should not parse images when parse_html_content_type_only is not specified" do
|
79
79
|
expect do
|
80
80
|
image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png')
|
81
81
|
image_url.title
|
82
|
-
end.
|
82
|
+
end.to raise_error
|
83
83
|
end
|
84
84
|
|
85
85
|
it "should parse images when parse_html_content_type_only is false" do
|
@@ -137,4 +137,14 @@ describe MetaInspector::Document do
|
|
137
137
|
expect(MetaInspector.new('http://example.com/%EF%BD%9E', normalize_url: false).url).to eq('http://example.com/%EF%BD%9E')
|
138
138
|
end
|
139
139
|
end
|
140
|
+
|
141
|
+
describe 'page encoding' do
|
142
|
+
it 'should encode title according to the charset' do
|
143
|
+
expect(MetaInspector.new('http://example-rtl.com/').title).to eq('بالفيديو.. "مصطفى بكري" : انتخابات الائتلاف غير نزيهة وموجهة لفوز أشخاص بعينها')
|
144
|
+
end
|
145
|
+
|
146
|
+
it 'should encode description according to the charset' do
|
147
|
+
expect(MetaInspector.new('http://example-rtl.com/').description).to eq('أعلن النائب مصطفى بكري انسحابه من ائتلاف دعم مصر بعد اعتراضه على نتيجة الانتخابات الداخلية للائتلاف، وخسارته فيها، وقال إنه سيترشح غدا على منصب الوكيل بالمجلس')
|
148
|
+
end
|
149
|
+
end
|
140
150
|
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx admin
|
3
|
+
Date: Mon, 01 Feb 2016 16:18:35 GMT
|
4
|
+
Content-Type: text/html; charset=windows-1256
|
5
|
+
Transfer-Encoding: chunked
|
6
|
+
Connection: keep-alive
|
7
|
+
Vary: Accept-Encoding
|
8
|
+
X-Powered-By: PHP/5.3.29
|
9
|
+
Set-Cookie: PHPSESSID=a0ddee5488c7480bbc7b0a50ac472d2a; path=/
|
10
|
+
Expires: Thu, 19 Nov 1981 08:52:00 GMT
|
11
|
+
Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0
|
12
|
+
Pragma: no-cache
|
13
|
+
|
14
|
+
|
15
|
+
<!doctype html>
|
16
|
+
<html class="no-js" lang="ar" dir="rtl">
|
17
|
+
<head>
|
18
|
+
<title>بالفيديو.. "مصطفى بكري" : انتخابات الائتلاف غير نزيهة وموجهة لفوز أشخاص بعينها</title>
|
19
|
+
<meta name="description" content="أعلن النائب مصطفى بكري انسحابه من ائتلاف دعم مصر بعد اعتراضه على نتيجة الانتخابات الداخلية للائتلاف، وخسارته فيها، وقال إنه سيترشح غدا على منصب الوكيل بالمجلس">
|
20
|
+
<meta name="keywords" content="��������,,,,�����,����,,,,��������,��������,���,�����,������,����,�����,������">
|
21
|
+
<meta property='og:title' content='بالفيديو.. "مصطفى بكري" : انتخابات الائتلاف غير نزيهة وموجهة لفوز أشخاص بعينها' />
|
22
|
+
</head>
|
23
|
+
<body class="page-front">
|
24
|
+
<h1 class="hide">�����</h1>
|
25
|
+
|
26
|
+
</body>
|
27
|
+
</html>
|
data/spec/spec_helper.rb
CHANGED
@@ -107,3 +107,6 @@ FakeWeb.register_uri(:get, "http://unsafe-facebook.com/", :response => fixture
|
|
107
107
|
# These examples are used to test normalize URLs
|
108
108
|
FakeWeb.register_uri(:get, "http://example.com/%EF%BD%9E", :response => fixture_file("example.response"))
|
109
109
|
FakeWeb.register_uri(:get, "http://example.com/~", :response => fixture_file("example.response"))
|
110
|
+
|
111
|
+
# Example to test correct encoding
|
112
|
+
FakeWeb.register_uri(:get, "http://example-rtl.com/", :response => fixture_file("encoding.response"))
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 5.0
|
4
|
+
version: 5.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-03-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '1.2'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: faraday-encoding
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.0.2
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.0.2
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: addressable
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -295,6 +309,7 @@ files:
|
|
295
309
|
- spec/fixtures/charset_001.response
|
296
310
|
- spec/fixtures/charset_002.response
|
297
311
|
- spec/fixtures/empty_page.response
|
312
|
+
- spec/fixtures/encoding.response
|
298
313
|
- spec/fixtures/example.response
|
299
314
|
- spec/fixtures/facebook.com.response
|
300
315
|
- spec/fixtures/guardian.co.uk.response
|