metainspector 5.7.0 → 5.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 860350c6b4704259715f0a0eea047893134d774928b0bf78c9633f5afec480ff
4
- data.tar.gz: d0abcb709893e63be252886723f8f202137dae3d02b506173a59b3368dcda7f7
3
+ metadata.gz: a349b7103e8b214dc3211da2bc46d830e8e88b85fa7bde9b734ff3ba13802be0
4
+ data.tar.gz: fc4b691504551ee1c0ca6df5f92f5f5dbf5f2ebfa71415cff5259a67aff17c5f
5
5
  SHA512:
6
- metadata.gz: ab74dde15e1864f4c747be953a512278578ab231bea4637e1fb752234766c5b895b47405bd8572c06fd421cd16170e69c4331fb3462a7177ea4c6c521748e1f9
7
- data.tar.gz: fde5cf878c7320d49c82aa430b2fc422044a07168e4b6e6cce53d3b33655b28f6ee811cc535ffca06be46c19403b83e7c453aa942895b9e77c675468fa3fea12
6
+ metadata.gz: 1a268679f23f0ecfdd4358304999200175c78c2bef37ad1d5acdba641b46512ec6e6dca8d18005b87bc7fbc9b632592ac657818c99d540324685a56912a8985c
7
+ data.tar.gz: 8f90160c9f923e69157cfc8c2d6b13f386dd394501d37330a6ede47e587d9474adbe9b932df86340fc3c14754eedd59538eef65944801f6567b26189be8db077
@@ -1,5 +1,10 @@
1
1
  # MetaInspector Changelog
2
2
 
3
+ ## [Changes in 5.7](https://github.com/jaimeiniesta/metainspector/compare/v5.6.0...v5.7.0)
4
+
5
+ * Avoids normalizing image URLs. https://github.com/jaimeiniesta/metainspector/pull/241
6
+ * Adds `NonHtmlErrorException` instead of `ParserError` https://github.com/jaimeiniesta/metainspector/pull/248
7
+
3
8
  ## [Changes in 5.6](https://github.com/jaimeiniesta/metainspector/compare/v5.5.0...v5.6.0)
4
9
 
5
10
  * New feature: `:encoding` option to force the encoding of a parsed document.
data/README.md CHANGED
@@ -85,6 +85,12 @@ page.author # author of the page from the meta author tag
85
85
  page.best_author # best author of the page, from a selection of candidates
86
86
  page.description # returns the meta description
87
87
  page.best_description # returns the first non-empty description between the following candidates: standard meta description, og:description, twitter:description, the first long paragraph
88
+ page.h1 # returns h1 text array
89
+ page.h2 # returns h2 text array
90
+ page.h3 # returns h3 text array
91
+ page.h4 # returns h4 text array
92
+ page.h5 # returns h5 text array
93
+ page.h6 # returns h6 text array
88
94
  ```
89
95
 
90
96
  ### Links
@@ -48,7 +48,7 @@ module MetaInspector
48
48
  delegate [:content_type, :response] => :@request
49
49
 
50
50
  delegate [:parsed, :title, :best_title, :author, :best_author,
51
- :description, :best_description, :links,
51
+ :h1, :h2, :h3, :h4, :h5, :h6, :description, :best_description, :links,
52
52
  :images, :feed, :charset, :meta_tags,
53
53
  :meta_tag, :meta, :favicon,
54
54
  :head_links, :stylesheets, :canonicals] => :@parser
@@ -66,6 +66,12 @@ module MetaInspector
66
66
  'best_author' => best_author,
67
67
  'description' => description,
68
68
  'best_description' => best_description,
69
+ 'h1' => h1,
70
+ 'h2' => h2,
71
+ 'h3' => h3,
72
+ 'h4' => h4,
73
+ 'h5' => h5,
74
+ 'h6' => h6,
69
75
  'links' => links.to_hash,
70
76
  'images' => images.to_a,
71
77
  'charset' => charset,
@@ -26,7 +26,8 @@ module MetaInspector
26
26
  delegate [:head_links, :stylesheets, :canonicals, :feed] => :@head_links_parser
27
27
  delegate [:links, :base_url] => :@links_parser
28
28
  delegate :images => :@images_parser
29
- delegate [:title, :best_title, :author, :best_author, :description, :best_description] => :@texts_parser
29
+ delegate [:title, :best_title, :author, :best_author, :description, :best_description,
30
+ :h1, :h2, :h3, :h4, :h5, :h6] => :@texts_parser
30
31
 
31
32
  # Returns the whole parsed document
32
33
  def parsed
@@ -13,6 +13,30 @@ module MetaInspector
13
13
  @best_title ||= find_best_title
14
14
  end
15
15
 
16
+ def h1
17
+ @h1 ||= find_heading('h1')
18
+ end
19
+
20
+ def h2
21
+ @h2 ||= find_heading('h2')
22
+ end
23
+
24
+ def h3
25
+ @h3 ||= find_heading('h3')
26
+ end
27
+
28
+ def h4
29
+ @h4 ||= find_heading('h4')
30
+ end
31
+
32
+ def h5
33
+ @h5 ||= find_heading('h5')
34
+ end
35
+
36
+ def h6
37
+ @h6 ||= find_heading('h6')
38
+ end
39
+
16
40
  # Returns the meta author, if present
17
41
  def author
18
42
  @author ||= meta['author']
@@ -45,6 +69,10 @@ module MetaInspector
45
69
 
46
70
  private
47
71
 
72
+ def find_heading(heading)
73
+ parsed.css(heading).map { |tag| tag.inner_text.strip.gsub(/\s+/, ' ') }.reject(&:empty?)
74
+ end
75
+
48
76
  # Look for candidates per list of priority
49
77
  def find_best_title
50
78
  candidates = [
@@ -1,3 +1,3 @@
1
1
  module MetaInspector
2
- VERSION = '5.7.0'
2
+ VERSION = '5.8.0'
3
3
  end
@@ -15,19 +15,19 @@ Gem::Specification.new do |gem|
15
15
  gem.version = MetaInspector::VERSION
16
16
 
17
17
  gem.add_dependency 'nokogiri', '~> 1.10.4'
18
- gem.add_dependency 'faraday', '~> 0.15.3'
19
- gem.add_dependency 'faraday_middleware', '~> 0.12.2'
18
+ gem.add_dependency 'faraday', '~> 0.17.0'
19
+ gem.add_dependency 'faraday_middleware', '~> 0.13.1'
20
20
  gem.add_dependency 'faraday-cookie_jar', '~> 0.0.6'
21
21
  gem.add_dependency 'faraday-http-cache', '~> 2.0.0'
22
22
  gem.add_dependency 'faraday-encoding', '~> 0.0.5'
23
- gem.add_dependency 'addressable', '~> 2.5.2'
24
- gem.add_dependency 'fastimage', '~> 2.1.4'
23
+ gem.add_dependency 'addressable', '~> 2.7.0'
24
+ gem.add_dependency 'fastimage', '~> 2.1.7'
25
25
  gem.add_dependency 'nesty', '~> 1.0.2'
26
26
 
27
- gem.add_development_dependency 'rspec', '~> 3.8.0'
27
+ gem.add_development_dependency 'rspec', '~> 3.9.0'
28
28
  gem.add_development_dependency 'webmock', '~> 3.7.6'
29
29
  gem.add_development_dependency 'awesome_print', '~> 1.8.0'
30
- gem.add_development_dependency 'rake', '~> 12.3.1'
30
+ gem.add_development_dependency 'rake', '~> 13.0.0'
31
31
  gem.add_development_dependency 'pry', '~> 0.12.2'
32
- gem.add_development_dependency 'rubocop', '~> 0.60.0'
32
+ gem.add_development_dependency 'rubocop', '~> 0.75.1'
33
33
  end
@@ -44,6 +44,12 @@ describe MetaInspector::Document do
44
44
  "images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
45
45
  "charset" => "utf-8",
46
46
  "feed" => "http://feeds.feedburner.com/PageRankAlert",
47
+ "h1" => [],
48
+ "h2" => ["Track your PageRank changes"],
49
+ "h3" => ["WHAT'S YOUR PAGERANK?"],
50
+ "h4" => ["Build your own lists", "Get e-mail alerts", "Track your history"],
51
+ "h5" => [],
52
+ "h6" => [],
47
53
  "content_type" => "text/html",
48
54
  "meta_tags" => {
49
55
  "name" => {
@@ -0,0 +1,23 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/0.7.67
3
+ Date: Fri, 18 Nov 2011 21:46:46 GMT
4
+ Content-Type: text/html
5
+ Connection: keep-alive
6
+ Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
7
+ Content-Length: 4987
8
+ X-Varnish: 2000423390
9
+ Age: 0
10
+ Via: 1.1 varnish
11
+
12
+ <html>
13
+ <head>
14
+ </head>
15
+ <body>
16
+ <h1>H1</h1>
17
+ <h2>H2</h2>
18
+ <h3>H3</h3>
19
+ <h4>H4</h4>
20
+ <h5>H5</h5>
21
+ <h6>H6</h6>
22
+ </body>
23
+ </html>
@@ -6,6 +6,48 @@ describe MetaInspector do
6
6
  expect(page.title).to eq('An example page')
7
7
  end
8
8
 
9
+ describe "#h1" do
10
+ it "should find h1 content" do
11
+ page = MetaInspector.new('http://example.com/headings')
12
+ expect(page.h1.first).to eq('H1')
13
+ end
14
+ end
15
+
16
+ describe "#h2" do
17
+ it "should find h2 content" do
18
+ page = MetaInspector.new('http://example.com/headings')
19
+ expect(page.h2.first).to eq('H2')
20
+ end
21
+ end
22
+
23
+ describe "#h3" do
24
+ it "should find h3 content" do
25
+ page = MetaInspector.new('http://example.com/headings')
26
+ expect(page.h3.first).to eq('H3')
27
+ end
28
+ end
29
+
30
+ describe "#h4" do
31
+ it "should find h4 content" do
32
+ page = MetaInspector.new('http://example.com/headings')
33
+ expect(page.h4.first).to eq('H4')
34
+ end
35
+ end
36
+
37
+ describe "#h5" do
38
+ it "should find h5 content" do
39
+ page = MetaInspector.new('http://example.com/headings')
40
+ expect(page.h5.first).to eq('H5')
41
+ end
42
+ end
43
+
44
+ describe "#h6" do
45
+ it "should find h6 content" do
46
+ page = MetaInspector.new('http://example.com/headings')
47
+ expect(page.h6.first).to eq('H6')
48
+ end
49
+ end
50
+
9
51
  describe '#best_title' do
10
52
  it "should find 'head title' when that's the only thing" do
11
53
  page = MetaInspector.new('http://example.com/title_in_head')
@@ -29,6 +29,7 @@ RSpec.configure do |config|
29
29
  stub_request(:get, "http://example.com/desc_in_twitter").to_return(fixture_file("desc_in_twitter.response"))
30
30
  stub_request(:get, "http://example.com/empty").to_return(fixture_file("empty_page.response"))
31
31
  stub_request(:get, "http://example.com/head_links").to_return(fixture_file("head_links.response"))
32
+ stub_request(:get, "http://example.com/headings").to_return(fixture_file("headings.response"))
32
33
  stub_request(:get, "http://example.com/invalid_byte_seq").to_return(fixture_file("invalid_byte_seq.response"))
33
34
  stub_request(:get, "http://example.com/invalid_utf8_byte_seq").to_return(fixture_file("invalid_utf8_byte_seq.response"))
34
35
  stub_request(:get, "http://example.com/invalid_href").to_return(fixture_file("invalid_href.response"))
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 5.7.0
4
+ version: 5.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaime Iniesta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-10-01 00:00:00.000000000 Z
11
+ date: 2019-10-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -30,28 +30,28 @@ dependencies:
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 0.15.3
33
+ version: 0.17.0
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 0.15.3
40
+ version: 0.17.0
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: faraday_middleware
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 0.12.2
47
+ version: 0.13.1
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 0.12.2
54
+ version: 0.13.1
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: faraday-cookie_jar
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -100,28 +100,28 @@ dependencies:
100
100
  requirements:
101
101
  - - "~>"
102
102
  - !ruby/object:Gem::Version
103
- version: 2.5.2
103
+ version: 2.7.0
104
104
  type: :runtime
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
- version: 2.5.2
110
+ version: 2.7.0
111
111
  - !ruby/object:Gem::Dependency
112
112
  name: fastimage
113
113
  requirement: !ruby/object:Gem::Requirement
114
114
  requirements:
115
115
  - - "~>"
116
116
  - !ruby/object:Gem::Version
117
- version: 2.1.4
117
+ version: 2.1.7
118
118
  type: :runtime
119
119
  prerelease: false
120
120
  version_requirements: !ruby/object:Gem::Requirement
121
121
  requirements:
122
122
  - - "~>"
123
123
  - !ruby/object:Gem::Version
124
- version: 2.1.4
124
+ version: 2.1.7
125
125
  - !ruby/object:Gem::Dependency
126
126
  name: nesty
127
127
  requirement: !ruby/object:Gem::Requirement
@@ -142,14 +142,14 @@ dependencies:
142
142
  requirements:
143
143
  - - "~>"
144
144
  - !ruby/object:Gem::Version
145
- version: 3.8.0
145
+ version: 3.9.0
146
146
  type: :development
147
147
  prerelease: false
148
148
  version_requirements: !ruby/object:Gem::Requirement
149
149
  requirements:
150
150
  - - "~>"
151
151
  - !ruby/object:Gem::Version
152
- version: 3.8.0
152
+ version: 3.9.0
153
153
  - !ruby/object:Gem::Dependency
154
154
  name: webmock
155
155
  requirement: !ruby/object:Gem::Requirement
@@ -184,14 +184,14 @@ dependencies:
184
184
  requirements:
185
185
  - - "~>"
186
186
  - !ruby/object:Gem::Version
187
- version: 12.3.1
187
+ version: 13.0.0
188
188
  type: :development
189
189
  prerelease: false
190
190
  version_requirements: !ruby/object:Gem::Requirement
191
191
  requirements:
192
192
  - - "~>"
193
193
  - !ruby/object:Gem::Version
194
- version: 12.3.1
194
+ version: 13.0.0
195
195
  - !ruby/object:Gem::Dependency
196
196
  name: pry
197
197
  requirement: !ruby/object:Gem::Requirement
@@ -212,14 +212,14 @@ dependencies:
212
212
  requirements:
213
213
  - - "~>"
214
214
  - !ruby/object:Gem::Version
215
- version: 0.60.0
215
+ version: 0.75.1
216
216
  type: :development
217
217
  prerelease: false
218
218
  version_requirements: !ruby/object:Gem::Requirement
219
219
  requirements:
220
220
  - - "~>"
221
221
  - !ruby/object:Gem::Version
222
- version: 0.60.0
222
+ version: 0.75.1
223
223
  description: MetaInspector lets you scrape a web page and get its links, images, texts,
224
224
  meta tags...
225
225
  email:
@@ -280,6 +280,7 @@ files:
280
280
  - spec/fixtures/facebook.com.response
281
281
  - spec/fixtures/guardian.co.uk.response
282
282
  - spec/fixtures/head_links.response
283
+ - spec/fixtures/headings.response
283
284
  - spec/fixtures/https.facebook.com.response
284
285
  - spec/fixtures/international.response
285
286
  - spec/fixtures/invalid_byte_seq.response