metainspector 5.7.0 → 5.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +6 -0
- data/lib/meta_inspector/document.rb +7 -1
- data/lib/meta_inspector/parser.rb +2 -1
- data/lib/meta_inspector/parsers/texts.rb +28 -0
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +7 -7
- data/spec/document_spec.rb +6 -0
- data/spec/fixtures/headings.response +23 -0
- data/spec/meta_inspector/texts_spec.rb +42 -0
- data/spec/spec_helper.rb +1 -0
- metadata +17 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a349b7103e8b214dc3211da2bc46d830e8e88b85fa7bde9b734ff3ba13802be0
|
4
|
+
data.tar.gz: fc4b691504551ee1c0ca6df5f92f5f5dbf5f2ebfa71415cff5259a67aff17c5f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1a268679f23f0ecfdd4358304999200175c78c2bef37ad1d5acdba641b46512ec6e6dca8d18005b87bc7fbc9b632592ac657818c99d540324685a56912a8985c
|
7
|
+
data.tar.gz: 8f90160c9f923e69157cfc8c2d6b13f386dd394501d37330a6ede47e587d9474adbe9b932df86340fc3c14754eedd59538eef65944801f6567b26189be8db077
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
# MetaInspector Changelog
|
2
2
|
|
3
|
+
## [Changes in 5.7](https://github.com/jaimeiniesta/metainspector/compare/v5.6.0...v5.7.0)
|
4
|
+
|
5
|
+
* Avoids normalizing image URLs. https://github.com/jaimeiniesta/metainspector/pull/241
|
6
|
+
* Adds `NonHtmlErrorException` instead of `ParserError` https://github.com/jaimeiniesta/metainspector/pull/248
|
7
|
+
|
3
8
|
## [Changes in 5.6](https://github.com/jaimeiniesta/metainspector/compare/v5.5.0...v5.6.0)
|
4
9
|
|
5
10
|
* New feature: `:encoding` option to force the encoding of a parsed document.
|
data/README.md
CHANGED
@@ -85,6 +85,12 @@ page.author # author of the page from the meta author tag
|
|
85
85
|
page.best_author # best author of the page, from a selection of candidates
|
86
86
|
page.description # returns the meta description
|
87
87
|
page.best_description # returns the first non-empty description between the following candidates: standard meta description, og:description, twitter:description, the first long paragraph
|
88
|
+
page.h1 # returns h1 text array
|
89
|
+
page.h2 # returns h2 text array
|
90
|
+
page.h3 # returns h3 text array
|
91
|
+
page.h4 # returns h4 text array
|
92
|
+
page.h5 # returns h5 text array
|
93
|
+
page.h6 # returns h6 text array
|
88
94
|
```
|
89
95
|
|
90
96
|
### Links
|
@@ -48,7 +48,7 @@ module MetaInspector
|
|
48
48
|
delegate [:content_type, :response] => :@request
|
49
49
|
|
50
50
|
delegate [:parsed, :title, :best_title, :author, :best_author,
|
51
|
-
:description, :best_description, :links,
|
51
|
+
:h1, :h2, :h3, :h4, :h5, :h6, :description, :best_description, :links,
|
52
52
|
:images, :feed, :charset, :meta_tags,
|
53
53
|
:meta_tag, :meta, :favicon,
|
54
54
|
:head_links, :stylesheets, :canonicals] => :@parser
|
@@ -66,6 +66,12 @@ module MetaInspector
|
|
66
66
|
'best_author' => best_author,
|
67
67
|
'description' => description,
|
68
68
|
'best_description' => best_description,
|
69
|
+
'h1' => h1,
|
70
|
+
'h2' => h2,
|
71
|
+
'h3' => h3,
|
72
|
+
'h4' => h4,
|
73
|
+
'h5' => h5,
|
74
|
+
'h6' => h6,
|
69
75
|
'links' => links.to_hash,
|
70
76
|
'images' => images.to_a,
|
71
77
|
'charset' => charset,
|
@@ -26,7 +26,8 @@ module MetaInspector
|
|
26
26
|
delegate [:head_links, :stylesheets, :canonicals, :feed] => :@head_links_parser
|
27
27
|
delegate [:links, :base_url] => :@links_parser
|
28
28
|
delegate :images => :@images_parser
|
29
|
-
delegate [:title, :best_title, :author, :best_author, :description, :best_description
|
29
|
+
delegate [:title, :best_title, :author, :best_author, :description, :best_description,
|
30
|
+
:h1, :h2, :h3, :h4, :h5, :h6] => :@texts_parser
|
30
31
|
|
31
32
|
# Returns the whole parsed document
|
32
33
|
def parsed
|
@@ -13,6 +13,30 @@ module MetaInspector
|
|
13
13
|
@best_title ||= find_best_title
|
14
14
|
end
|
15
15
|
|
16
|
+
def h1
|
17
|
+
@h1 ||= find_heading('h1')
|
18
|
+
end
|
19
|
+
|
20
|
+
def h2
|
21
|
+
@h2 ||= find_heading('h2')
|
22
|
+
end
|
23
|
+
|
24
|
+
def h3
|
25
|
+
@h3 ||= find_heading('h3')
|
26
|
+
end
|
27
|
+
|
28
|
+
def h4
|
29
|
+
@h4 ||= find_heading('h4')
|
30
|
+
end
|
31
|
+
|
32
|
+
def h5
|
33
|
+
@h5 ||= find_heading('h5')
|
34
|
+
end
|
35
|
+
|
36
|
+
def h6
|
37
|
+
@h6 ||= find_heading('h6')
|
38
|
+
end
|
39
|
+
|
16
40
|
# Returns the meta author, if present
|
17
41
|
def author
|
18
42
|
@author ||= meta['author']
|
@@ -45,6 +69,10 @@ module MetaInspector
|
|
45
69
|
|
46
70
|
private
|
47
71
|
|
72
|
+
def find_heading(heading)
|
73
|
+
parsed.css(heading).map { |tag| tag.inner_text.strip.gsub(/\s+/, ' ') }.reject(&:empty?)
|
74
|
+
end
|
75
|
+
|
48
76
|
# Look for candidates per list of priority
|
49
77
|
def find_best_title
|
50
78
|
candidates = [
|
data/meta_inspector.gemspec
CHANGED
@@ -15,19 +15,19 @@ Gem::Specification.new do |gem|
|
|
15
15
|
gem.version = MetaInspector::VERSION
|
16
16
|
|
17
17
|
gem.add_dependency 'nokogiri', '~> 1.10.4'
|
18
|
-
gem.add_dependency 'faraday', '~> 0.
|
19
|
-
gem.add_dependency 'faraday_middleware', '~> 0.
|
18
|
+
gem.add_dependency 'faraday', '~> 0.17.0'
|
19
|
+
gem.add_dependency 'faraday_middleware', '~> 0.13.1'
|
20
20
|
gem.add_dependency 'faraday-cookie_jar', '~> 0.0.6'
|
21
21
|
gem.add_dependency 'faraday-http-cache', '~> 2.0.0'
|
22
22
|
gem.add_dependency 'faraday-encoding', '~> 0.0.5'
|
23
|
-
gem.add_dependency 'addressable', '~> 2.
|
24
|
-
gem.add_dependency 'fastimage', '~> 2.1.
|
23
|
+
gem.add_dependency 'addressable', '~> 2.7.0'
|
24
|
+
gem.add_dependency 'fastimage', '~> 2.1.7'
|
25
25
|
gem.add_dependency 'nesty', '~> 1.0.2'
|
26
26
|
|
27
|
-
gem.add_development_dependency 'rspec', '~> 3.
|
27
|
+
gem.add_development_dependency 'rspec', '~> 3.9.0'
|
28
28
|
gem.add_development_dependency 'webmock', '~> 3.7.6'
|
29
29
|
gem.add_development_dependency 'awesome_print', '~> 1.8.0'
|
30
|
-
gem.add_development_dependency 'rake', '~>
|
30
|
+
gem.add_development_dependency 'rake', '~> 13.0.0'
|
31
31
|
gem.add_development_dependency 'pry', '~> 0.12.2'
|
32
|
-
gem.add_development_dependency 'rubocop', '~> 0.
|
32
|
+
gem.add_development_dependency 'rubocop', '~> 0.75.1'
|
33
33
|
end
|
data/spec/document_spec.rb
CHANGED
@@ -44,6 +44,12 @@ describe MetaInspector::Document do
|
|
44
44
|
"images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
|
45
45
|
"charset" => "utf-8",
|
46
46
|
"feed" => "http://feeds.feedburner.com/PageRankAlert",
|
47
|
+
"h1" => [],
|
48
|
+
"h2" => ["Track your PageRank changes"],
|
49
|
+
"h3" => ["WHAT'S YOUR PAGERANK?"],
|
50
|
+
"h4" => ["Build your own lists", "Get e-mail alerts", "Track your history"],
|
51
|
+
"h5" => [],
|
52
|
+
"h6" => [],
|
47
53
|
"content_type" => "text/html",
|
48
54
|
"meta_tags" => {
|
49
55
|
"name" => {
|
@@ -0,0 +1,23 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/0.7.67
|
3
|
+
Date: Fri, 18 Nov 2011 21:46:46 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Connection: keep-alive
|
6
|
+
Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
|
7
|
+
Content-Length: 4987
|
8
|
+
X-Varnish: 2000423390
|
9
|
+
Age: 0
|
10
|
+
Via: 1.1 varnish
|
11
|
+
|
12
|
+
<html>
|
13
|
+
<head>
|
14
|
+
</head>
|
15
|
+
<body>
|
16
|
+
<h1>H1</h1>
|
17
|
+
<h2>H2</h2>
|
18
|
+
<h3>H3</h3>
|
19
|
+
<h4>H4</h4>
|
20
|
+
<h5>H5</h5>
|
21
|
+
<h6>H6</h6>
|
22
|
+
</body>
|
23
|
+
</html>
|
@@ -6,6 +6,48 @@ describe MetaInspector do
|
|
6
6
|
expect(page.title).to eq('An example page')
|
7
7
|
end
|
8
8
|
|
9
|
+
describe "#h1" do
|
10
|
+
it "should find h1 content" do
|
11
|
+
page = MetaInspector.new('http://example.com/headings')
|
12
|
+
expect(page.h1.first).to eq('H1')
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
describe "#h2" do
|
17
|
+
it "should find h2 content" do
|
18
|
+
page = MetaInspector.new('http://example.com/headings')
|
19
|
+
expect(page.h2.first).to eq('H2')
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
describe "#h3" do
|
24
|
+
it "should find h3 content" do
|
25
|
+
page = MetaInspector.new('http://example.com/headings')
|
26
|
+
expect(page.h3.first).to eq('H3')
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe "#h4" do
|
31
|
+
it "should find h4 content" do
|
32
|
+
page = MetaInspector.new('http://example.com/headings')
|
33
|
+
expect(page.h4.first).to eq('H4')
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
describe "#h5" do
|
38
|
+
it "should find h5 content" do
|
39
|
+
page = MetaInspector.new('http://example.com/headings')
|
40
|
+
expect(page.h5.first).to eq('H5')
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
describe "#h6" do
|
45
|
+
it "should find h6 content" do
|
46
|
+
page = MetaInspector.new('http://example.com/headings')
|
47
|
+
expect(page.h6.first).to eq('H6')
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
9
51
|
describe '#best_title' do
|
10
52
|
it "should find 'head title' when that's the only thing" do
|
11
53
|
page = MetaInspector.new('http://example.com/title_in_head')
|
data/spec/spec_helper.rb
CHANGED
@@ -29,6 +29,7 @@ RSpec.configure do |config|
|
|
29
29
|
stub_request(:get, "http://example.com/desc_in_twitter").to_return(fixture_file("desc_in_twitter.response"))
|
30
30
|
stub_request(:get, "http://example.com/empty").to_return(fixture_file("empty_page.response"))
|
31
31
|
stub_request(:get, "http://example.com/head_links").to_return(fixture_file("head_links.response"))
|
32
|
+
stub_request(:get, "http://example.com/headings").to_return(fixture_file("headings.response"))
|
32
33
|
stub_request(:get, "http://example.com/invalid_byte_seq").to_return(fixture_file("invalid_byte_seq.response"))
|
33
34
|
stub_request(:get, "http://example.com/invalid_utf8_byte_seq").to_return(fixture_file("invalid_utf8_byte_seq.response"))
|
34
35
|
stub_request(:get, "http://example.com/invalid_href").to_return(fixture_file("invalid_href.response"))
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 5.
|
4
|
+
version: 5.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-10-
|
11
|
+
date: 2019-10-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -30,28 +30,28 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 0.
|
33
|
+
version: 0.17.0
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 0.
|
40
|
+
version: 0.17.0
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: faraday_middleware
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: 0.
|
47
|
+
version: 0.13.1
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 0.
|
54
|
+
version: 0.13.1
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: faraday-cookie_jar
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -100,28 +100,28 @@ dependencies:
|
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: 2.
|
103
|
+
version: 2.7.0
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: 2.
|
110
|
+
version: 2.7.0
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
112
|
name: fastimage
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
114
114
|
requirements:
|
115
115
|
- - "~>"
|
116
116
|
- !ruby/object:Gem::Version
|
117
|
-
version: 2.1.
|
117
|
+
version: 2.1.7
|
118
118
|
type: :runtime
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
122
|
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
|
-
version: 2.1.
|
124
|
+
version: 2.1.7
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
126
|
name: nesty
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -142,14 +142,14 @@ dependencies:
|
|
142
142
|
requirements:
|
143
143
|
- - "~>"
|
144
144
|
- !ruby/object:Gem::Version
|
145
|
-
version: 3.
|
145
|
+
version: 3.9.0
|
146
146
|
type: :development
|
147
147
|
prerelease: false
|
148
148
|
version_requirements: !ruby/object:Gem::Requirement
|
149
149
|
requirements:
|
150
150
|
- - "~>"
|
151
151
|
- !ruby/object:Gem::Version
|
152
|
-
version: 3.
|
152
|
+
version: 3.9.0
|
153
153
|
- !ruby/object:Gem::Dependency
|
154
154
|
name: webmock
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
@@ -184,14 +184,14 @@ dependencies:
|
|
184
184
|
requirements:
|
185
185
|
- - "~>"
|
186
186
|
- !ruby/object:Gem::Version
|
187
|
-
version:
|
187
|
+
version: 13.0.0
|
188
188
|
type: :development
|
189
189
|
prerelease: false
|
190
190
|
version_requirements: !ruby/object:Gem::Requirement
|
191
191
|
requirements:
|
192
192
|
- - "~>"
|
193
193
|
- !ruby/object:Gem::Version
|
194
|
-
version:
|
194
|
+
version: 13.0.0
|
195
195
|
- !ruby/object:Gem::Dependency
|
196
196
|
name: pry
|
197
197
|
requirement: !ruby/object:Gem::Requirement
|
@@ -212,14 +212,14 @@ dependencies:
|
|
212
212
|
requirements:
|
213
213
|
- - "~>"
|
214
214
|
- !ruby/object:Gem::Version
|
215
|
-
version: 0.
|
215
|
+
version: 0.75.1
|
216
216
|
type: :development
|
217
217
|
prerelease: false
|
218
218
|
version_requirements: !ruby/object:Gem::Requirement
|
219
219
|
requirements:
|
220
220
|
- - "~>"
|
221
221
|
- !ruby/object:Gem::Version
|
222
|
-
version: 0.
|
222
|
+
version: 0.75.1
|
223
223
|
description: MetaInspector lets you scrape a web page and get its links, images, texts,
|
224
224
|
meta tags...
|
225
225
|
email:
|
@@ -280,6 +280,7 @@ files:
|
|
280
280
|
- spec/fixtures/facebook.com.response
|
281
281
|
- spec/fixtures/guardian.co.uk.response
|
282
282
|
- spec/fixtures/head_links.response
|
283
|
+
- spec/fixtures/headings.response
|
283
284
|
- spec/fixtures/https.facebook.com.response
|
284
285
|
- spec/fixtures/international.response
|
285
286
|
- spec/fixtures/invalid_byte_seq.response
|