metainspector 5.7.0 → 5.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +6 -0
- data/lib/meta_inspector/document.rb +7 -1
- data/lib/meta_inspector/parser.rb +2 -1
- data/lib/meta_inspector/parsers/texts.rb +28 -0
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +7 -7
- data/spec/document_spec.rb +6 -0
- data/spec/fixtures/headings.response +23 -0
- data/spec/meta_inspector/texts_spec.rb +42 -0
- data/spec/spec_helper.rb +1 -0
- metadata +17 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a349b7103e8b214dc3211da2bc46d830e8e88b85fa7bde9b734ff3ba13802be0
|
4
|
+
data.tar.gz: fc4b691504551ee1c0ca6df5f92f5f5dbf5f2ebfa71415cff5259a67aff17c5f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1a268679f23f0ecfdd4358304999200175c78c2bef37ad1d5acdba641b46512ec6e6dca8d18005b87bc7fbc9b632592ac657818c99d540324685a56912a8985c
|
7
|
+
data.tar.gz: 8f90160c9f923e69157cfc8c2d6b13f386dd394501d37330a6ede47e587d9474adbe9b932df86340fc3c14754eedd59538eef65944801f6567b26189be8db077
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
# MetaInspector Changelog
|
2
2
|
|
3
|
+
## [Changes in 5.7](https://github.com/jaimeiniesta/metainspector/compare/v5.6.0...v5.7.0)
|
4
|
+
|
5
|
+
* Avoids normalizing image URLs. https://github.com/jaimeiniesta/metainspector/pull/241
|
6
|
+
* Adds `NonHtmlErrorException` instead of `ParserError` https://github.com/jaimeiniesta/metainspector/pull/248
|
7
|
+
|
3
8
|
## [Changes in 5.6](https://github.com/jaimeiniesta/metainspector/compare/v5.5.0...v5.6.0)
|
4
9
|
|
5
10
|
* New feature: `:encoding` option to force the encoding of a parsed document.
|
data/README.md
CHANGED
@@ -85,6 +85,12 @@ page.author # author of the page from the meta author tag
|
|
85
85
|
page.best_author # best author of the page, from a selection of candidates
|
86
86
|
page.description # returns the meta description
|
87
87
|
page.best_description # returns the first non-empty description among the following candidates: standard meta description, og:description, twitter:description, the first long paragraph
|
88
|
+
page.h1 # returns h1 text array
|
89
|
+
page.h2 # returns h2 text array
|
90
|
+
page.h3 # returns h3 text array
|
91
|
+
page.h4 # returns h4 text array
|
92
|
+
page.h5 # returns h5 text array
|
93
|
+
page.h6 # returns h6 text array
|
88
94
|
```
|
89
95
|
|
90
96
|
### Links
|
data/lib/meta_inspector/document.rb
CHANGED
@@ -48,7 +48,7 @@ module MetaInspector
|
|
48
48
|
delegate [:content_type, :response] => :@request
|
49
49
|
|
50
50
|
delegate [:parsed, :title, :best_title, :author, :best_author,
|
51
|
-
:description, :best_description, :links,
|
51
|
+
:h1, :h2, :h3, :h4, :h5, :h6, :description, :best_description, :links,
|
52
52
|
:images, :feed, :charset, :meta_tags,
|
53
53
|
:meta_tag, :meta, :favicon,
|
54
54
|
:head_links, :stylesheets, :canonicals] => :@parser
|
@@ -66,6 +66,12 @@ module MetaInspector
|
|
66
66
|
'best_author' => best_author,
|
67
67
|
'description' => description,
|
68
68
|
'best_description' => best_description,
|
69
|
+
'h1' => h1,
|
70
|
+
'h2' => h2,
|
71
|
+
'h3' => h3,
|
72
|
+
'h4' => h4,
|
73
|
+
'h5' => h5,
|
74
|
+
'h6' => h6,
|
69
75
|
'links' => links.to_hash,
|
70
76
|
'images' => images.to_a,
|
71
77
|
'charset' => charset,
|
data/lib/meta_inspector/parser.rb
CHANGED
@@ -26,7 +26,8 @@ module MetaInspector
|
|
26
26
|
delegate [:head_links, :stylesheets, :canonicals, :feed] => :@head_links_parser
|
27
27
|
delegate [:links, :base_url] => :@links_parser
|
28
28
|
delegate :images => :@images_parser
|
29
|
-
delegate [:title, :best_title, :author, :best_author, :description, :best_description
|
29
|
+
delegate [:title, :best_title, :author, :best_author, :description, :best_description,
|
30
|
+
:h1, :h2, :h3, :h4, :h5, :h6] => :@texts_parser
|
30
31
|
|
31
32
|
# Returns the whole parsed document
|
32
33
|
def parsed
|
data/lib/meta_inspector/parsers/texts.rb
CHANGED
@@ -13,6 +13,30 @@ module MetaInspector
|
|
13
13
|
@best_title ||= find_best_title
|
14
14
|
end
|
15
15
|
|
16
|
+
def h1
|
17
|
+
@h1 ||= find_heading('h1')
|
18
|
+
end
|
19
|
+
|
20
|
+
def h2
|
21
|
+
@h2 ||= find_heading('h2')
|
22
|
+
end
|
23
|
+
|
24
|
+
def h3
|
25
|
+
@h3 ||= find_heading('h3')
|
26
|
+
end
|
27
|
+
|
28
|
+
def h4
|
29
|
+
@h4 ||= find_heading('h4')
|
30
|
+
end
|
31
|
+
|
32
|
+
def h5
|
33
|
+
@h5 ||= find_heading('h5')
|
34
|
+
end
|
35
|
+
|
36
|
+
def h6
|
37
|
+
@h6 ||= find_heading('h6')
|
38
|
+
end
|
39
|
+
|
16
40
|
# Returns the meta author, if present
|
17
41
|
def author
|
18
42
|
@author ||= meta['author']
|
@@ -45,6 +69,10 @@ module MetaInspector
|
|
45
69
|
|
46
70
|
private
|
47
71
|
|
72
|
+
def find_heading(heading)
|
73
|
+
parsed.css(heading).map { |tag| tag.inner_text.strip.gsub(/\s+/, ' ') }.reject(&:empty?)
|
74
|
+
end
|
75
|
+
|
48
76
|
# Look for candidates per list of priority
|
49
77
|
def find_best_title
|
50
78
|
candidates = [
|
data/meta_inspector.gemspec
CHANGED
@@ -15,19 +15,19 @@ Gem::Specification.new do |gem|
|
|
15
15
|
gem.version = MetaInspector::VERSION
|
16
16
|
|
17
17
|
gem.add_dependency 'nokogiri', '~> 1.10.4'
|
18
|
-
gem.add_dependency 'faraday', '~> 0.
|
19
|
-
gem.add_dependency 'faraday_middleware', '~> 0.
|
18
|
+
gem.add_dependency 'faraday', '~> 0.17.0'
|
19
|
+
gem.add_dependency 'faraday_middleware', '~> 0.13.1'
|
20
20
|
gem.add_dependency 'faraday-cookie_jar', '~> 0.0.6'
|
21
21
|
gem.add_dependency 'faraday-http-cache', '~> 2.0.0'
|
22
22
|
gem.add_dependency 'faraday-encoding', '~> 0.0.5'
|
23
|
-
gem.add_dependency 'addressable', '~> 2.
|
24
|
-
gem.add_dependency 'fastimage', '~> 2.1.
|
23
|
+
gem.add_dependency 'addressable', '~> 2.7.0'
|
24
|
+
gem.add_dependency 'fastimage', '~> 2.1.7'
|
25
25
|
gem.add_dependency 'nesty', '~> 1.0.2'
|
26
26
|
|
27
|
-
gem.add_development_dependency 'rspec', '~> 3.
|
27
|
+
gem.add_development_dependency 'rspec', '~> 3.9.0'
|
28
28
|
gem.add_development_dependency 'webmock', '~> 3.7.6'
|
29
29
|
gem.add_development_dependency 'awesome_print', '~> 1.8.0'
|
30
|
-
gem.add_development_dependency 'rake', '~>
|
30
|
+
gem.add_development_dependency 'rake', '~> 13.0.0'
|
31
31
|
gem.add_development_dependency 'pry', '~> 0.12.2'
|
32
|
-
gem.add_development_dependency 'rubocop', '~> 0.
|
32
|
+
gem.add_development_dependency 'rubocop', '~> 0.75.1'
|
33
33
|
end
|
data/spec/document_spec.rb
CHANGED
@@ -44,6 +44,12 @@ describe MetaInspector::Document do
|
|
44
44
|
"images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
|
45
45
|
"charset" => "utf-8",
|
46
46
|
"feed" => "http://feeds.feedburner.com/PageRankAlert",
|
47
|
+
"h1" => [],
|
48
|
+
"h2" => ["Track your PageRank changes"],
|
49
|
+
"h3" => ["WHAT'S YOUR PAGERANK?"],
|
50
|
+
"h4" => ["Build your own lists", "Get e-mail alerts", "Track your history"],
|
51
|
+
"h5" => [],
|
52
|
+
"h6" => [],
|
47
53
|
"content_type" => "text/html",
|
48
54
|
"meta_tags" => {
|
49
55
|
"name" => {
|
data/spec/fixtures/headings.response
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
HTTP/1.1 200 OK
|
2
|
+
Server: nginx/0.7.67
|
3
|
+
Date: Fri, 18 Nov 2011 21:46:46 GMT
|
4
|
+
Content-Type: text/html
|
5
|
+
Connection: keep-alive
|
6
|
+
Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
|
7
|
+
Content-Length: 4987
|
8
|
+
X-Varnish: 2000423390
|
9
|
+
Age: 0
|
10
|
+
Via: 1.1 varnish
|
11
|
+
|
12
|
+
<html>
|
13
|
+
<head>
|
14
|
+
</head>
|
15
|
+
<body>
|
16
|
+
<h1>H1</h1>
|
17
|
+
<h2>H2</h2>
|
18
|
+
<h3>H3</h3>
|
19
|
+
<h4>H4</h4>
|
20
|
+
<h5>H5</h5>
|
21
|
+
<h6>H6</h6>
|
22
|
+
</body>
|
23
|
+
</html>
|
data/spec/meta_inspector/texts_spec.rb
CHANGED
@@ -6,6 +6,48 @@ describe MetaInspector do
|
|
6
6
|
expect(page.title).to eq('An example page')
|
7
7
|
end
|
8
8
|
|
9
|
+
describe "#h1" do
|
10
|
+
it "should find h1 content" do
|
11
|
+
page = MetaInspector.new('http://example.com/headings')
|
12
|
+
expect(page.h1.first).to eq('H1')
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
describe "#h2" do
|
17
|
+
it "should find h2 content" do
|
18
|
+
page = MetaInspector.new('http://example.com/headings')
|
19
|
+
expect(page.h2.first).to eq('H2')
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
describe "#h3" do
|
24
|
+
it "should find h3 content" do
|
25
|
+
page = MetaInspector.new('http://example.com/headings')
|
26
|
+
expect(page.h3.first).to eq('H3')
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe "#h4" do
|
31
|
+
it "should find h4 content" do
|
32
|
+
page = MetaInspector.new('http://example.com/headings')
|
33
|
+
expect(page.h4.first).to eq('H4')
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
describe "#h5" do
|
38
|
+
it "should find h5 content" do
|
39
|
+
page = MetaInspector.new('http://example.com/headings')
|
40
|
+
expect(page.h5.first).to eq('H5')
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
describe "#h6" do
|
45
|
+
it "should find h6 content" do
|
46
|
+
page = MetaInspector.new('http://example.com/headings')
|
47
|
+
expect(page.h6.first).to eq('H6')
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
9
51
|
describe '#best_title' do
|
10
52
|
it "should find 'head title' when that's the only thing" do
|
11
53
|
page = MetaInspector.new('http://example.com/title_in_head')
|
data/spec/spec_helper.rb
CHANGED
@@ -29,6 +29,7 @@ RSpec.configure do |config|
|
|
29
29
|
stub_request(:get, "http://example.com/desc_in_twitter").to_return(fixture_file("desc_in_twitter.response"))
|
30
30
|
stub_request(:get, "http://example.com/empty").to_return(fixture_file("empty_page.response"))
|
31
31
|
stub_request(:get, "http://example.com/head_links").to_return(fixture_file("head_links.response"))
|
32
|
+
stub_request(:get, "http://example.com/headings").to_return(fixture_file("headings.response"))
|
32
33
|
stub_request(:get, "http://example.com/invalid_byte_seq").to_return(fixture_file("invalid_byte_seq.response"))
|
33
34
|
stub_request(:get, "http://example.com/invalid_utf8_byte_seq").to_return(fixture_file("invalid_utf8_byte_seq.response"))
|
34
35
|
stub_request(:get, "http://example.com/invalid_href").to_return(fixture_file("invalid_href.response"))
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 5.7.0
|
4
|
+
version: 5.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-10-
|
11
|
+
date: 2019-10-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -30,28 +30,28 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 0.
|
33
|
+
version: 0.17.0
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 0.
|
40
|
+
version: 0.17.0
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: faraday_middleware
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: 0.
|
47
|
+
version: 0.13.1
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 0.
|
54
|
+
version: 0.13.1
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: faraday-cookie_jar
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -100,28 +100,28 @@ dependencies:
|
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: 2.
|
103
|
+
version: 2.7.0
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: 2.
|
110
|
+
version: 2.7.0
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
112
|
name: fastimage
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
114
114
|
requirements:
|
115
115
|
- - "~>"
|
116
116
|
- !ruby/object:Gem::Version
|
117
|
-
version: 2.1.
|
117
|
+
version: 2.1.7
|
118
118
|
type: :runtime
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
122
|
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
|
-
version: 2.1.
|
124
|
+
version: 2.1.7
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
126
|
name: nesty
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -142,14 +142,14 @@ dependencies:
|
|
142
142
|
requirements:
|
143
143
|
- - "~>"
|
144
144
|
- !ruby/object:Gem::Version
|
145
|
-
version: 3.
|
145
|
+
version: 3.9.0
|
146
146
|
type: :development
|
147
147
|
prerelease: false
|
148
148
|
version_requirements: !ruby/object:Gem::Requirement
|
149
149
|
requirements:
|
150
150
|
- - "~>"
|
151
151
|
- !ruby/object:Gem::Version
|
152
|
-
version: 3.
|
152
|
+
version: 3.9.0
|
153
153
|
- !ruby/object:Gem::Dependency
|
154
154
|
name: webmock
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
@@ -184,14 +184,14 @@ dependencies:
|
|
184
184
|
requirements:
|
185
185
|
- - "~>"
|
186
186
|
- !ruby/object:Gem::Version
|
187
|
-
version:
|
187
|
+
version: 13.0.0
|
188
188
|
type: :development
|
189
189
|
prerelease: false
|
190
190
|
version_requirements: !ruby/object:Gem::Requirement
|
191
191
|
requirements:
|
192
192
|
- - "~>"
|
193
193
|
- !ruby/object:Gem::Version
|
194
|
-
version:
|
194
|
+
version: 13.0.0
|
195
195
|
- !ruby/object:Gem::Dependency
|
196
196
|
name: pry
|
197
197
|
requirement: !ruby/object:Gem::Requirement
|
@@ -212,14 +212,14 @@ dependencies:
|
|
212
212
|
requirements:
|
213
213
|
- - "~>"
|
214
214
|
- !ruby/object:Gem::Version
|
215
|
-
version: 0.
|
215
|
+
version: 0.75.1
|
216
216
|
type: :development
|
217
217
|
prerelease: false
|
218
218
|
version_requirements: !ruby/object:Gem::Requirement
|
219
219
|
requirements:
|
220
220
|
- - "~>"
|
221
221
|
- !ruby/object:Gem::Version
|
222
|
-
version: 0.
|
222
|
+
version: 0.75.1
|
223
223
|
description: MetaInspector lets you scrape a web page and get its links, images, texts,
|
224
224
|
meta tags...
|
225
225
|
email:
|
@@ -280,6 +280,7 @@ files:
|
|
280
280
|
- spec/fixtures/facebook.com.response
|
281
281
|
- spec/fixtures/guardian.co.uk.response
|
282
282
|
- spec/fixtures/head_links.response
|
283
|
+
- spec/fixtures/headings.response
|
283
284
|
- spec/fixtures/https.facebook.com.response
|
284
285
|
- spec/fixtures/international.response
|
285
286
|
- spec/fixtures/invalid_byte_seq.response
|