metainspector 5.0.2 → 5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a38316bdb0ec3f428421adfa7fe90c61673a26e0
4
- data.tar.gz: 46a787a3feaffdb516871dacd8cc72f0009aae36
3
+ metadata.gz: 0f69de2705313a7048cf7d2067238a38f7e42bfc
4
+ data.tar.gz: 9b4770b9f1e769b5da1abab6814bc7dc4bb38fa6
5
5
  SHA512:
6
- metadata.gz: a1168ee9c0530f78f8a6efd6a5f67cf06c0f6f8a3bef1c427e70073056edf71eb6d90c94c84c4d8e97fc14e04924ad6c1f9ef35bf3eaaaaec269c5ebeb955bb5
7
- data.tar.gz: ada6c39cf2e0db2d241ef136d47429649899b1bc700c662447e71a57fd752b23c874b9e8c90a9ecb59e8e72430db34a9957769cf01c8e1632bf08aa225526deb
6
+ metadata.gz: 2f35f03456f48aa4cfd65de50349ac7f67d08ad699e1f72c29251e95468f52457c137beeddcd4cf2afac261a9a6dfdb4d9462b831a907fd75d01bfa6168231e5
7
+ data.tar.gz: afc08f9b3217b7c6ee0fec83d1fef1f706af14f685a44d081724341e93ef352d2a69645436b375d9c0e08f49a576ab08669539f75b826b51bfbb3203e4f0e6a5
data/README.md CHANGED
@@ -328,13 +328,13 @@ MetaInspector.new('https://example.com', faraday_options: { ssl: { verify: false
328
328
 
329
329
  ### HTML Content Only
330
330
 
331
- MetaInspector will try to parse all URLs by default. If you want to raise an exception when trying to parse a non-html URL (one that has a content-type different than text/html), you can state it like this:
331
+ MetaInspector will attempt to fetch any URL, but by default it will raise an exception when trying to parse a non-HTML URL (one whose content-type is something other than text/html). You can disable this behaviour with:
332
332
 
333
333
  ```ruby
334
- page = MetaInspector.new('sitevalidator.com', :html_content_only => true)
334
+ page = MetaInspector.new('sitevalidator.com', :html_content_only => false)
335
335
  ```
336
336
 
337
- This is useful when using MetaInspector on web spidering. Although on the initial URL you'll probably have an HTML URL, following links you may find yourself trying to parse non-html URLs.
337
+ This option has been deprecated since 5.1.0 and will be removed in 5.2.0.
338
338
 
339
339
  ```ruby
340
340
  page = MetaInspector.new('http://example.com/image.png')
@@ -8,8 +8,6 @@ module MetaInspector
8
8
  # * connection_timeout: defaults to 20 seconds
9
9
  # * read_timeout: defaults to 20 seconds
10
10
  # * retries: defaults to 3 times
11
- # * html_content_type_only: if an exception should be raised if request
12
- # content-type is not text/html. Defaults to false.
13
11
  # * allow_redirections: when true, follow HTTP redirects. Defaults to true
14
12
  # * document: the html of the url as a string
15
13
  # * headers: object containing custom headers for the request
@@ -20,7 +18,16 @@ module MetaInspector
20
18
  @connection_timeout = options[:connection_timeout]
21
19
  @read_timeout = options[:read_timeout]
22
20
  @retries = options[:retries]
23
- @html_content_only = options[:html_content_only]
21
+
22
+ unless options[:html_content_only].nil?
23
+ @html_content_only = options[:html_content_only]
24
+
25
+ puts <<-EOS
26
+ DEPRECATION NOTICE: html_content_only is deprecated and turned on by default since 5.1.0,
27
+ this option will be removed in 5.2.0
28
+ EOS
29
+ end
30
+
24
31
  @allow_redirections = options[:allow_redirections]
25
32
  @document = options[:document]
26
33
  @download_images = options[:download_images]
@@ -83,7 +90,7 @@ module MetaInspector
83
90
  def defaults
84
91
  { :timeout => 20,
85
92
  :retries => 3,
86
- :html_content_only => false,
93
+ :html_content_only => true,
87
94
  :headers => {
88
95
  'User-Agent' => default_user_agent,
89
96
  'Accept-Encoding' => 'identity'
@@ -98,11 +105,11 @@ module MetaInspector
98
105
  end
99
106
 
100
107
  def document
101
- @document ||= if html_content_only && content_type != 'text/html'
102
- fail MetaInspector::ParserError.new "The url provided contains #{content_type} content instead of text/html content"
103
- else
104
- @request.read
105
- end
108
+ @document ||= if html_content_only && !content_type.nil? && content_type != 'text/html'
109
+ fail MetaInspector::ParserError.new "The url provided contains #{content_type} content instead of text/html content"
110
+ else
111
+ @request.read
112
+ end
106
113
  end
107
114
  end
108
115
  end
@@ -2,6 +2,7 @@ require 'faraday'
2
2
  require 'faraday_middleware'
3
3
  require 'faraday-cookie_jar'
4
4
  require 'faraday-http-cache'
5
+ require 'faraday/encoding'
5
6
 
6
7
  module MetaInspector
7
8
 
@@ -62,6 +63,7 @@ module MetaInspector
62
63
  end
63
64
 
64
65
  faraday.headers.merge!(@headers || {})
66
+ faraday.response :encoding
65
67
  faraday.adapter :net_http
66
68
  end
67
69
 
@@ -1,3 +1,3 @@
1
1
  module MetaInspector
2
- VERSION = '5.0.2'
2
+ VERSION = '5.1.0'
3
3
  end
@@ -19,6 +19,7 @@ Gem::Specification.new do |gem|
19
19
  gem.add_dependency 'faraday_middleware', '~> 0.10'
20
20
  gem.add_dependency 'faraday-cookie_jar', '~> 0.0'
21
21
  gem.add_dependency 'faraday-http-cache', '~> 1.2'
22
+ gem.add_dependency 'faraday-encoding', '~> 0.0.2'
22
23
  gem.add_dependency 'addressable', '~> 2.4'
23
24
  gem.add_dependency 'fastimage', '~> 1.8.1'
24
25
  gem.add_dependency 'nesty', '~> 1.0'
@@ -75,11 +75,11 @@ describe MetaInspector::Document do
75
75
  end
76
76
 
77
77
  describe 'exception handling' do
78
- it "should parse images when parse_html_content_type_only is not specified" do
78
+ it "should not parse images when parse_html_content_type_only is not specified" do
79
79
  expect do
80
80
  image_url = MetaInspector::Document.new('http://pagerankalert.com/image.png')
81
81
  image_url.title
82
- end.to_not raise_error
82
+ end.to raise_error
83
83
  end
84
84
 
85
85
  it "should parse images when parse_html_content_type_only is false" do
@@ -137,4 +137,14 @@ describe MetaInspector::Document do
137
137
  expect(MetaInspector.new('http://example.com/%EF%BD%9E', normalize_url: false).url).to eq('http://example.com/%EF%BD%9E')
138
138
  end
139
139
  end
140
+
141
+ describe 'page encoding' do
142
+ it 'should encode title according to the charset' do
143
+ expect(MetaInspector.new('http://example-rtl.com/').title).to eq('بالفيديو.. "مصطفى بكري" : انتخابات الائتلاف غير نزيهة وموجهة لفوز أشخاص بعينها')
144
+ end
145
+
146
+ it 'should encode description according to the charset' do
147
+ expect(MetaInspector.new('http://example-rtl.com/').description).to eq('أعلن النائب مصطفى بكري انسحابه من ائتلاف دعم مصر بعد اعتراضه على نتيجة الانتخابات الداخلية للائتلاف، وخسارته فيها، وقال إنه سيترشح غدا على منصب الوكيل بالمجلس')
148
+ end
149
+ end
140
150
  end
@@ -0,0 +1,27 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx admin
3
+ Date: Mon, 01 Feb 2016 16:18:35 GMT
4
+ Content-Type: text/html; charset=windows-1256
5
+ Transfer-Encoding: chunked
6
+ Connection: keep-alive
7
+ Vary: Accept-Encoding
8
+ X-Powered-By: PHP/5.3.29
9
+ Set-Cookie: PHPSESSID=a0ddee5488c7480bbc7b0a50ac472d2a; path=/
10
+ Expires: Thu, 19 Nov 1981 08:52:00 GMT
11
+ Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0
12
+ Pragma: no-cache
13
+
14
+
15
+ <!doctype html>
16
+ <html class="no-js" lang="ar" dir="rtl">
17
+ <head>
18
+ <title>��������.. "����� ����" : �������� �������� ��� ����� ������ ���� ����� ������</title>
19
+ <meta name="description" content="���� ������ ����� ���� ������� �� ������ ��� ��� ��� ������� ��� ����� ���������� �������� �������ݡ ������� ���ǡ ���� ��� ������ ��� ��� ���� ������ �������">
20
+ <meta name="keywords" content="��������,,,,�����,����,,,,��������,��������,���,�����,������,����,�����,������">
21
+ <meta property='og:title' content='��������.. "����� ����" : �������� �������� ��� ����� ������ ���� ����� ������' />
22
+ </head>
23
+ <body class="page-front">
24
+ <h1 class="hide">�����</h1>
25
+
26
+ </body>
27
+ </html>
data/spec/spec_helper.rb CHANGED
@@ -107,3 +107,6 @@ FakeWeb.register_uri(:get, "http://unsafe-facebook.com/", :response => fixture
107
107
  # These examples are used to test normalize URLs
108
108
  FakeWeb.register_uri(:get, "http://example.com/%EF%BD%9E", :response => fixture_file("example.response"))
109
109
  FakeWeb.register_uri(:get, "http://example.com/~", :response => fixture_file("example.response"))
110
+
111
+ # Example to test correct encoding
112
+ FakeWeb.register_uri(:get, "http://example-rtl.com/", :response => fixture_file("encoding.response"))
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.0.2
4
+ version: 5.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaime Iniesta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-02-24 00:00:00.000000000 Z
11
+ date: 2016-03-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: '1.2'
83
+ - !ruby/object:Gem::Dependency
84
+ name: faraday-encoding
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.0.2
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.0.2
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: addressable
85
99
  requirement: !ruby/object:Gem::Requirement
@@ -295,6 +309,7 @@ files:
295
309
  - spec/fixtures/charset_001.response
296
310
  - spec/fixtures/charset_002.response
297
311
  - spec/fixtures/empty_page.response
312
+ - spec/fixtures/encoding.response
298
313
  - spec/fixtures/example.response
299
314
  - spec/fixtures/facebook.com.response
300
315
  - spec/fixtures/guardian.co.uk.response