metainspector 5.0.2 → 5.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a38316bdb0ec3f428421adfa7fe90c61673a26e0
4
- data.tar.gz: 46a787a3feaffdb516871dacd8cc72f0009aae36
3
+ metadata.gz: 0f69de2705313a7048cf7d2067238a38f7e42bfc
4
+ data.tar.gz: 9b4770b9f1e769b5da1abab6814bc7dc4bb38fa6
5
5
  SHA512:
6
- metadata.gz: a1168ee9c0530f78f8a6efd6a5f67cf06c0f6f8a3bef1c427e70073056edf71eb6d90c94c84c4d8e97fc14e04924ad6c1f9ef35bf3eaaaaec269c5ebeb955bb5
7
- data.tar.gz: ada6c39cf2e0db2d241ef136d47429649899b1bc700c662447e71a57fd752b23c874b9e8c90a9ecb59e8e72430db34a9957769cf01c8e1632bf08aa225526deb
6
+ metadata.gz: 2f35f03456f48aa4cfd65de50349ac7f67d08ad699e1f72c29251e95468f52457c137beeddcd4cf2afac261a9a6dfdb4d9462b831a907fd75d01bfa6168231e5
7
+ data.tar.gz: afc08f9b3217b7c6ee0fec83d1fef1f706af14f685a44d081724341e93ef352d2a69645436b375d9c0e08f49a576ab08669539f75b826b51bfbb3203e4f0e6a5
data/README.md CHANGED
@@ -328,13 +328,13 @@ MetaInspector.new('https://example.com', faraday_options: { ssl: { verify: false
328
328
 
329
329
  ### HTML Content Only
330
330
 
331
- MetaInspector will try to parse all URLs by default. If you want to raise an exception when trying to parse a non-html URL (one that has a content-type different than text/html), you can state it like this:
331
+ By default, MetaInspector will raise an exception when trying to parse a non-HTML URL (one that has a content-type different than text/html). You can disable this behaviour with:
332
332
 
333
333
  ```ruby
334
- page = MetaInspector.new('sitevalidator.com', :html_content_only => true)
334
+ page = MetaInspector.new('sitevalidator.com', :html_content_only => false)
335
335
  ```
336
336
 
337
- This is useful when using MetaInspector on web spidering. Although on the initial URL you'll probably have an HTML URL, following links you may find yourself trying to parse non-html URLs.
337
+ This option is deprecated since 5.1.0 and will be removed in 5.2.0.
338
338
 
339
339
  ```ruby
340
340
  page = MetaInspector.new('http://example.com/image.png')
@@ -8,8 +8,6 @@ module MetaInspector
8
8
  # * connection_timeout: defaults to 20 seconds
9
9
  # * read_timeout: defaults to 20 seconds
10
10
  # * retries: defaults to 3 times
11
- # * html_content_type_only: if an exception should be raised if request
12
- # content-type is not text/html. Defaults to false.
13
11
  # * allow_redirections: when true, follow HTTP redirects. Defaults to true
14
12
  # * document: the html of the url as a string
15
13
  # * headers: object containing custom headers for the request
@@ -20,7 +18,16 @@ module MetaInspector
20
18
  @connection_timeout = options[:connection_timeout]
21
19
  @read_timeout = options[:read_timeout]
22
20
  @retries = options[:retries]
23
- @html_content_only = options[:html_content_only]
21
+
22
+ unless options[:html_content_only].nil?
23
+ @html_content_only = options[:html_content_only]
24
+
25
+ puts <<-EOS
26
+ DEPRECATION NOTICE: html_content_only is deprecated and turned on by default since 5.1.0,
27
+ this option will be removed in 5.2.0
28
+ EOS
29
+ end
30
+
24
31
  @allow_redirections = options[:allow_redirections]
25
32
  @document = options[:document]
26
33
  @download_images = options[:download_images]
@@ -83,7 +90,7 @@ module MetaInspector
83
90
  def defaults
84
91
  { :timeout => 20,
85
92
  :retries => 3,
86
- :html_content_only => false,
93
+ :html_content_only => true,
87
94
  :headers => {
88
95
  'User-Agent' => default_user_agent,
89
96
  'Accept-Encoding' => 'identity'
@@ -98,11 +105,11 @@ module MetaInspector
98
105
  end
99
106
 
100
107
  def document
101
- @document ||= if html_content_only && content_type != 'text/html'
102
- fail MetaInspector::ParserError.new "The url provided contains #{content_type} content instead of text/html content"
103
- else
104
- @request.read
105
- end
108
+ @document ||= if html_content_only && !content_type.nil? && content_type != 'text/html'
109
+ fail MetaInspector::ParserError.new "The url provided contains #{content_type} content instead of text/html content"
110
+ else
111
+ @request.read
112
+ end
106
113
  end
107
114
  end
108
115
  end
@@ -2,6 +2,7 @@ require 'faraday'
2
2
  require 'faraday_middleware'
3
3
  require 'faraday-cookie_jar'
4
4
  require 'faraday-http-cache'
5
+ require 'faraday/encoding'
5
6
 
6
7
  module MetaInspector
7
8
 
@@ -62,6 +63,7 @@ module MetaInspector
62
63
  end
63
64
 
64
65
  faraday.headers.merge!(@headers || {})
66
+ faraday.response :encoding
65
67
  faraday.adapter :net_http
66
68
  end
67
69
 
@@ -1,3 +1,3 @@
1
1
  module MetaInspector
2
- VERSION = '5.0.2'
2
+ VERSION = '5.1.0'
3
3
  end
@@ -19,6 +19,7 @@ Gem::Specification.new do |gem|
19
19
  gem.add_dependency 'faraday_middleware', '~> 0.10'
20
20
  gem.add_dependency 'faraday-cookie_jar', '~> 0.0'
21
21
  gem.add_dependency 'faraday-http-cache', '~> 1.2'
22
+ gem.add_dependency 'faraday-encoding', '~> 0.0.2'
22
23
  gem.add_dependency 'addressable', '~> 2.4'
23
24
  gem.add_dependency 'fastimage', '~> 1.8.1'
24
25
  gem.add_dependency 'nesty', '~> 1.0'
@@ -75,11 +75,11 @@ describe MetaInspector::Document do
75
75
  end
76
76
 
77
77
  describe 'exception handling' do
78
- it "should parse images when parse_html_content_type_only is not specified" do
78
+ it "should not parse images when parse_html_content_type_only is not specified" do
79
79
  expect do
80
80
  image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png')
81
81
  image_url.title
82
- end.to_not raise_error
82
+ end.to raise_error
83
83
  end
84
84
 
85
85
  it "should parse images when parse_html_content_type_only is false" do
@@ -137,4 +137,14 @@ describe MetaInspector::Document do
137
137
  expect(MetaInspector.new('http://example.com/%EF%BD%9E', normalize_url: false).url).to eq('http://example.com/%EF%BD%9E')
138
138
  end
139
139
  end
140
+
141
+ describe 'page encoding' do
142
+ it 'should encode title according to the charset' do
143
+ expect(MetaInspector.new('http://example-rtl.com/').title).to eq('بالفيديو.. "مصطفى بكري" : انتخابات الائتلاف غير نزيهة وموجهة لفوز أشخاص بعينها')
144
+ end
145
+
146
+ it 'should encode description according to the charset' do
147
+ expect(MetaInspector.new('http://example-rtl.com/').description).to eq('أعلن النائب مصطفى بكري انسحابه من ائتلاف دعم مصر بعد اعتراضه على نتيجة الانتخابات الداخلية للائتلاف، وخسارته فيها، وقال إنه سيترشح غدا على منصب الوكيل بالمجلس')
148
+ end
149
+ end
140
150
  end
@@ -0,0 +1,27 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx admin
3
+ Date: Mon, 01 Feb 2016 16:18:35 GMT
4
+ Content-Type: text/html; charset=windows-1256
5
+ Transfer-Encoding: chunked
6
+ Connection: keep-alive
7
+ Vary: Accept-Encoding
8
+ X-Powered-By: PHP/5.3.29
9
+ Set-Cookie: PHPSESSID=a0ddee5488c7480bbc7b0a50ac472d2a; path=/
10
+ Expires: Thu, 19 Nov 1981 08:52:00 GMT
11
+ Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0
12
+ Pragma: no-cache
13
+
14
+
15
+ <!doctype html>
16
+ <html class="no-js" lang="ar" dir="rtl">
17
+ <head>
18
+ <title>بالفيديو.. "مصطفى بكري" : انتخابات الائتلاف غير نزيهة وموجهة لفوز أشخاص بعينها</title>
19
+ <meta name="description" content="أعلن النائب مصطفى بكري انسحابه من ائتلاف دعم مصر بعد اعتراضه على نتيجة الانتخابات الداخلية للائتلاف، وخسارته فيها، وقال إنه سيترشح غدا على منصب الوكيل بالمجلس">
20
+ <meta name="keywords" content="بالفيديو,,,,مصطفى,بكري,,,,انتخابات,الائتلاف,غير,نزيهة,وموجهة,لفوز,أشخاص,بعينها">
21
+ <meta property='og:title' content='بالفيديو.. "مصطفى بكري" : انتخابات الائتلاف غير نزيهة وموجهة لفوز أشخاص بعينها' />
22
+ </head>
23
+ <body class="page-front">
24
+ <h1 class="hide">�����</h1>
25
+
26
+ </body>
27
+ </html>
data/spec/spec_helper.rb CHANGED
@@ -107,3 +107,6 @@ FakeWeb.register_uri(:get, "http://unsafe-facebook.com/", :response => fixture
107
107
  # These examples are used to test normalize URLs
108
108
  FakeWeb.register_uri(:get, "http://example.com/%EF%BD%9E", :response => fixture_file("example.response"))
109
109
  FakeWeb.register_uri(:get, "http://example.com/~", :response => fixture_file("example.response"))
110
+
111
+ # Example to test correct encoding
112
+ FakeWeb.register_uri(:get, "http://example-rtl.com/", :response => fixture_file("encoding.response"))
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.0.2
4
+ version: 5.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaime Iniesta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-02-24 00:00:00.000000000 Z
11
+ date: 2016-03-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: '1.2'
83
+ - !ruby/object:Gem::Dependency
84
+ name: faraday-encoding
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.0.2
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.0.2
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: addressable
85
99
  requirement: !ruby/object:Gem::Requirement
@@ -295,6 +309,7 @@ files:
295
309
  - spec/fixtures/charset_001.response
296
310
  - spec/fixtures/charset_002.response
297
311
  - spec/fixtures/empty_page.response
312
+ - spec/fixtures/encoding.response
298
313
  - spec/fixtures/example.response
299
314
  - spec/fixtures/facebook.com.response
300
315
  - spec/fixtures/guardian.co.uk.response