metainspector 5.7.0 → 5.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 860350c6b4704259715f0a0eea047893134d774928b0bf78c9633f5afec480ff
4
- data.tar.gz: d0abcb709893e63be252886723f8f202137dae3d02b506173a59b3368dcda7f7
3
+ metadata.gz: e684c1133fc1fee2a9ffcd99b3294d22bb1d6d8598c7faf0e5c86a117a0e8663
4
+ data.tar.gz: b542f91035175aa6304495dd8f5ec79bcf8c0279bd700948f20bc742832b0653
5
5
  SHA512:
6
- metadata.gz: ab74dde15e1864f4c747be953a512278578ab231bea4637e1fb752234766c5b895b47405bd8572c06fd421cd16170e69c4331fb3462a7177ea4c6c521748e1f9
7
- data.tar.gz: fde5cf878c7320d49c82aa430b2fc422044a07168e4b6e6cce53d3b33655b28f6ee811cc535ffca06be46c19403b83e7c453aa942895b9e77c675468fa3fea12
6
+ metadata.gz: 796aa57288c6873fad48f67cc9c9da36f04d84cd24d66c2825944e42a5655a4d5cbf2a86fddc5592c44afb166a069630d372e3faa6ac4e8964eb8408ee9176aa
7
+ data.tar.gz: 16dcd111e4197a836a75b357fd035a051247b04f2afc52ff4832a4c63eef0bd986568873f17807dbe1951bcb4b42400d379449ee98fdd2a5afee16b152f439ce
@@ -1,6 +1,5 @@
1
1
  script: "bundle exec rspec -b"
2
2
  rvm:
3
- - 2.3.6
4
- - 2.4.3
5
- - 2.5.0
6
- - 2.6.4
3
+ - 2.5.8
4
+ - 2.6.6
5
+ - 2.7.1
@@ -1,5 +1,32 @@
1
1
  # MetaInspector Changelog
2
2
 
3
+ ## Unreleased
4
+
5
+ * Upgrade to Faraday 1.1.
6
+
7
+ ## [Changes in 5.10.1](https://github.com/jaimeiniesta/metainspector/compare/v5.10.0...v5.10.1)
8
+
9
+ * Fix for empty base_href. Makes relative links work when base_href is not nil but empty ("").
10
+ * Drop support for Ruby 2.4, add support for Ruby 2.7.
11
+
12
+ ## [Changes in 5.10](https://github.com/jaimeiniesta/metainspector/compare/v5.9.0...v5.10.0)
13
+
14
+ * Upgrade to Faraday 1.0.
15
+
16
+ ## [Changes in 5.9](https://github.com/jaimeiniesta/metainspector/compare/v5.8.0...v5.9.0)
17
+
18
+ * Added #feeds method to retrieve all feeds of a page.
19
+ * Adds deprecation warning on #feed method.
20
+
21
+ ## [Changes in 5.8](https://github.com/jaimeiniesta/metainspector/compare/v5.7.0...v5.8.0)
22
+
23
+ * Added h1..h6 support.
24
+
25
+ ## [Changes in 5.7](https://github.com/jaimeiniesta/metainspector/compare/v5.6.0...v5.7.0)
26
+
27
+ * Avoids normalizing image URLs. https://github.com/jaimeiniesta/metainspector/pull/241
28
+ * Adds `NonHtmlErrorException` instead of `ParserError` https://github.com/jaimeiniesta/metainspector/pull/248
29
+
3
30
  ## [Changes in 5.6](https://github.com/jaimeiniesta/metainspector/compare/v5.5.0...v5.6.0)
4
31
 
5
32
  * New feature: `:encoding` option to force the encoding of a parsed document.
data/README.md CHANGED
@@ -22,6 +22,8 @@ If you're using it on a Rails application, just add it to your Gemfile and run `
22
22
  gem 'metainspector'
23
23
  ```
24
24
 
25
+ Supported Ruby versions are defined in [`.travis.yml`](.travis.yml).
26
+
25
27
  ## Usage
26
28
 
27
29
  Initialize a MetaInspector instance for a URL, like this:
@@ -73,7 +75,7 @@ page.root_url # Root url (scheme + host, like http://sitevalidator.co
73
75
  page.head_links # an array of hashes of all head/links
74
76
  page.stylesheets # an array of hashes of all head/links where rel='stylesheet'
75
77
  page.canonicals # an array of hashes of all head/links where rel='canonical'
76
- page.feed # Get rss or atom links in meta data fields as array
78
+ page.feeds # Get rss or atom links in meta data fields as an array of hashes in the form { href: "...", title: "...", type: "..." }
77
79
  ```
78
80
 
79
81
  ### Texts
@@ -85,6 +87,12 @@ page.author # author of the page from the meta author tag
85
87
  page.best_author # best author of the page, from a selection of candidates
86
88
  page.description # returns the meta description
87
89
  page.best_description # returns the first non-empty description between the following candidates: standard meta description, og:description, twitter:description, the first long paragraph
90
+ page.h1 # returns h1 text array
91
+ page.h2 # returns h2 text array
92
+ page.h3 # returns h3 text array
93
+ page.h4 # returns h4 text array
94
+ page.h5 # returns h5 text array
95
+ page.h6 # returns h6 text array
88
96
  ```
89
97
 
90
98
  ### Links
@@ -48,8 +48,8 @@ module MetaInspector
48
48
  delegate [:content_type, :response] => :@request
49
49
 
50
50
  delegate [:parsed, :title, :best_title, :author, :best_author,
51
- :description, :best_description, :links,
52
- :images, :feed, :charset, :meta_tags,
51
+ :h1, :h2, :h3, :h4, :h5, :h6, :description, :best_description, :links,
52
+ :images, :feeds, :feed, :charset, :meta_tags,
53
53
  :meta_tag, :meta, :favicon,
54
54
  :head_links, :stylesheets, :canonicals] => :@parser
55
55
 
@@ -66,10 +66,17 @@ module MetaInspector
66
66
  'best_author' => best_author,
67
67
  'description' => description,
68
68
  'best_description' => best_description,
69
+ 'h1' => h1,
70
+ 'h2' => h2,
71
+ 'h3' => h3,
72
+ 'h4' => h4,
73
+ 'h5' => h5,
74
+ 'h6' => h6,
69
75
  'links' => links.to_hash,
70
76
  'images' => images.to_a,
71
77
  'charset' => charset,
72
78
  'feed' => feed,
79
+ 'feeds' => feeds,
73
80
  'content_type' => content_type,
74
81
  'meta_tags' => meta_tags,
75
82
  'favicon' => images.favicon,
@@ -23,10 +23,11 @@ module MetaInspector
23
23
  extend Forwardable
24
24
  delegate [:url, :scheme, :host] => :@document
25
25
  delegate [:meta_tags, :meta_tag, :meta, :charset] => :@meta_tag_parser
26
- delegate [:head_links, :stylesheets, :canonicals, :feed] => :@head_links_parser
26
+ delegate [:head_links, :stylesheets, :canonicals, :feeds, :feed] => :@head_links_parser
27
27
  delegate [:links, :base_url] => :@links_parser
28
28
  delegate :images => :@images_parser
29
- delegate [:title, :best_title, :author, :best_author, :description, :best_description] => :@texts_parser
29
+ delegate [:title, :best_title, :author, :best_author, :description, :best_description,
30
+ :h1, :h2, :h3, :h4, :h5, :h6] => :@texts_parser
30
31
 
31
32
  # Returns the whole parsed document
32
33
  def parsed
@@ -3,6 +3,10 @@ module MetaInspector
3
3
  class HeadLinksParser < Base
4
4
  delegate [:parsed, :base_url] => :@main_parser
5
5
 
6
+ KNOWN_FEED_TYPES = %w[
7
+ application/rss+xml application/atom+xml application/json
8
+ ].freeze
9
+
6
10
  def head_links
7
11
  @head_links ||= parsed.css('head link').map do |tag|
8
12
  Hash[
@@ -24,16 +28,25 @@ module MetaInspector
24
28
  @canonicals ||= head_links.select { |hl| hl[:rel] == 'canonical' }
25
29
  end
26
30
 
27
- # Returns the parsed document meta rss link
28
- def feed
29
- @feed ||= (parsed_feed('rss') || parsed_feed('atom'))
30
- end
31
+ def feeds
32
+ @feeds ||=
33
+ parsed.search("//link[@rel='alternate']").map do |link|
34
+ next if !KNOWN_FEED_TYPES.include?(link["type"]) || link["href"].to_s.strip == ''
31
35
 
32
- private
36
+ {
37
+ title: link["title"],
38
+ href: URL.absolutify(link["href"], base_url),
39
+ type: link["type"]
40
+ }
41
+ end.compact
42
+ end
33
43
 
34
- def parsed_feed(format)
35
- feed = parsed.search("//link[@type='application/#{format}+xml']").find{|link| link.attributes["href"] }
36
- feed ? URL.absolutify(feed['href'], base_url) : nil
44
+ def feed
45
+ warn "DEPRECATION: Use MetaInspector#feeds instead of #feed. The former gives you all feeds and their metadata, the latter will be removed."
46
+ @feed ||= begin
47
+ first_feed = feeds.find { |l| /\/(rss|atom)\+xml$/i =~ l[:type] } || {}
48
+ first_feed[:href]
49
+ end
37
50
  end
38
51
  end
39
52
  end
@@ -47,7 +47,8 @@ module MetaInspector
47
47
  # This can be the one set on a <base> tag,
48
48
  # or the url of the document if no <base> tag was found.
49
49
  def base_url
50
- base_href || url
50
+ current_base_href = base_href.to_s.strip.empty? ? nil : base_href
51
+ current_base_href || url
51
52
  end
52
53
 
53
54
  # Returns the value of the href attribute on the <base /> tag, if exists
@@ -13,6 +13,30 @@ module MetaInspector
13
13
  @best_title ||= find_best_title
14
14
  end
15
15
 
16
+ def h1
17
+ @h1 ||= find_heading('h1')
18
+ end
19
+
20
+ def h2
21
+ @h2 ||= find_heading('h2')
22
+ end
23
+
24
+ def h3
25
+ @h3 ||= find_heading('h3')
26
+ end
27
+
28
+ def h4
29
+ @h4 ||= find_heading('h4')
30
+ end
31
+
32
+ def h5
33
+ @h5 ||= find_heading('h5')
34
+ end
35
+
36
+ def h6
37
+ @h6 ||= find_heading('h6')
38
+ end
39
+
16
40
  # Returns the meta author, if present
17
41
  def author
18
42
  @author ||= meta['author']
@@ -45,6 +69,10 @@ module MetaInspector
45
69
 
46
70
  private
47
71
 
72
+ def find_heading(heading)
73
+ parsed.css(heading).map { |tag| tag.inner_text.strip.gsub(/\s+/, ' ') }.reject(&:empty?)
74
+ end
75
+
48
76
  # Look for candidates per list of priority
49
77
  def find_best_title
50
78
  candidates = [
@@ -48,7 +48,7 @@ module MetaInspector
48
48
  @response ||= fetch
49
49
  rescue Faraday::TimeoutError => e
50
50
  raise MetaInspector::TimeoutError.new(e)
51
- rescue Faraday::Error::ConnectionFailed, Faraday::SSLError, URI::InvalidURIError, FaradayMiddleware::RedirectLimitReached => e
51
+ rescue Faraday::ConnectionFailed, Faraday::SSLError, URI::InvalidURIError, FaradayMiddleware::RedirectLimitReached => e
52
52
  raise MetaInspector::RequestError.new(e)
53
53
  end
54
54
 
@@ -1,3 +1,3 @@
1
1
  module MetaInspector
2
- VERSION = '5.7.0'
2
+ VERSION = '5.11.0'
3
3
  end
@@ -1,11 +1,11 @@
1
1
  require File.expand_path('../lib/meta_inspector/version', __FILE__)
2
2
 
3
3
  Gem::Specification.new do |gem|
4
- gem.authors = ["Jaime Iniesta"]
5
- gem.email = ["jaimeiniesta@gmail.com"]
4
+ gem.author = "Jaime Iniesta"
5
+ gem.email = "jaimeiniesta@gmail.com"
6
6
  gem.description = %q{MetaInspector lets you scrape a web page and get its links, images, texts, meta tags...}
7
7
  gem.summary = %q{MetaInspector is a ruby gem for web scraping purposes, that returns metadata from a given URL}
8
- gem.homepage = "https://github.com/jaimeiniesta/metainspector"
8
+ gem.homepage = "https://github.com/metainspector/metainspector"
9
9
  gem.license = "MIT"
10
10
 
11
11
  gem.files = `git ls-files`.split("\n")
@@ -14,20 +14,20 @@ Gem::Specification.new do |gem|
14
14
  gem.require_paths = ["lib"]
15
15
  gem.version = MetaInspector::VERSION
16
16
 
17
- gem.add_dependency 'nokogiri', '~> 1.10.4'
18
- gem.add_dependency 'faraday', '~> 0.15.3'
19
- gem.add_dependency 'faraday_middleware', '~> 0.12.2'
20
- gem.add_dependency 'faraday-cookie_jar', '~> 0.0.6'
21
- gem.add_dependency 'faraday-http-cache', '~> 2.0.0'
17
+ gem.add_dependency 'nokogiri', '~> 1.10.9'
18
+ gem.add_dependency 'faraday', '~> 1.1.0'
19
+ gem.add_dependency 'faraday_middleware', '~> 1.0.0'
20
+ gem.add_dependency 'faraday-cookie_jar', '~> 0.0.7'
21
+ gem.add_dependency 'faraday-http-cache', '~> 2.2.0'
22
22
  gem.add_dependency 'faraday-encoding', '~> 0.0.5'
23
- gem.add_dependency 'addressable', '~> 2.5.2'
24
- gem.add_dependency 'fastimage', '~> 2.1.4'
23
+ gem.add_dependency 'addressable', '~> 2.7.0'
24
+ gem.add_dependency 'fastimage', '~> 2.1.7'
25
25
  gem.add_dependency 'nesty', '~> 1.0.2'
26
26
 
27
- gem.add_development_dependency 'rspec', '~> 3.8.0'
28
- gem.add_development_dependency 'webmock', '~> 3.7.6'
27
+ gem.add_development_dependency 'rspec', '~> 3.9.0'
28
+ gem.add_development_dependency 'webmock', '~> 3.8.3'
29
29
  gem.add_development_dependency 'awesome_print', '~> 1.8.0'
30
- gem.add_development_dependency 'rake', '~> 12.3.1'
31
- gem.add_development_dependency 'pry', '~> 0.12.2'
32
- gem.add_development_dependency 'rubocop', '~> 0.60.0'
30
+ gem.add_development_dependency 'rake', '~> 13.0.1'
31
+ gem.add_development_dependency 'pry', '~> 0.13.1'
32
+ gem.add_development_dependency 'rubocop', '~> 0.82.0'
33
33
  end
@@ -44,6 +44,13 @@ describe MetaInspector::Document do
44
44
  "images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
45
45
  "charset" => "utf-8",
46
46
  "feed" => "http://feeds.feedburner.com/PageRankAlert",
47
+ "feeds" => [{href: "http://feeds.feedburner.com/PageRankAlert", title: "PageRankAlert.com blog", type: "application/rss+xml"}],
48
+ "h1" => [],
49
+ "h2" => ["Track your PageRank changes"],
50
+ "h3" => ["WHAT'S YOUR PAGERANK?"],
51
+ "h4" => ["Build your own lists", "Get e-mail alerts", "Track your history"],
52
+ "h5" => [],
53
+ "h6" => [],
47
54
  "content_type" => "text/html",
48
55
  "meta_tags" => {
49
56
  "name" => {
@@ -0,0 +1,23 @@
1
+ HTTP/1.1 200
2
+ date: Wed, 08 Jan 2020 23:21:58 GMT
3
+ content-type: text/html; charset=UTF-8
4
+ server: nginx/0.7.67
5
+
6
+ <!DOCTYPE html>
7
+ <html>
8
+ <head>
9
+ <title>a page with feeds</title>
10
+ <link rel="alternate" title="Articles - JSON Feed" type="application/json" href="https://example.org/feed.json" />
11
+ <link rel="alternate" title="Comments - JSON Feed" type="application/json" href="https://example.org/feed/comments.json" />
12
+ <link rel="alternate" title="Articles - RSS Feed" type="application/rss+xml" href="https://example.org/feed.rss" />
13
+ <link rel="alternate" title="Comments - RSS Feed" type="application/rss+xml" href="https://example.org/feed/comments.rss" />
14
+ <link rel="alternate" title="Articles - Atom Feed" type="application/atom+xml" href="https://example.org/feed.xml" />
15
+ <link rel="alternate" title="Comments - Atom Feed" type="application/atom+xml" href="https://example.org/feed/comments.xml" />
16
+
17
+ <link rel="alternate" title="Invalid Feed" />
18
+ <link rel="alternate" title="Feed with empty href" type="application/atom+xml" href="" />
19
+ </head>
20
+ <body>
21
+
22
+ </body>
23
+ </html>
@@ -0,0 +1,23 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/0.7.67
3
+ Date: Fri, 18 Nov 2011 21:46:46 GMT
4
+ Content-Type: text/html
5
+ Connection: keep-alive
6
+ Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
7
+ Content-Length: 4987
8
+ X-Varnish: 2000423390
9
+ Age: 0
10
+ Via: 1.1 varnish
11
+
12
+ <html>
13
+ <head>
14
+ </head>
15
+ <body>
16
+ <h1>H1</h1>
17
+ <h2>H2</h2>
18
+ <h3>H3</h3>
19
+ <h4>H4</h4>
20
+ <h5>H5</h5>
21
+ <h6>H6</h6>
22
+ </body>
23
+ </html>
@@ -0,0 +1,22 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/1.0.5
3
+ Date: Thu, 29 Dec 2011 23:10:13 GMT
4
+ Content-Type: text/html
5
+ Content-Length: 15013
6
+ Last-Modified: Fri, 02 Dec 2011 21:00:49 GMT
7
+ Connection: keep-alive
8
+ Accept-Ranges: bytes
9
+
10
+ <!DOCTYPE html>
11
+ <html>
12
+ <head>
13
+ <base href=""/>
14
+ <meta charset="utf-8" />
15
+ <title>Relative links</title>
16
+ </head>
17
+ <body>
18
+ <p>Relative links</p>
19
+ <a href="about">About</a>
20
+ <a href="../sitemap">Sitemap</a>
21
+ </body>
22
+ </html>
@@ -39,7 +39,10 @@ describe MetaInspector do
39
39
  context "on page with some broken feed links" do
40
40
  let(:page){ MetaInspector.new('http://example.com/broken_head_links') }
41
41
  it "tries to find correct one" do
42
- expect(page.feed).to eq("http://www.guardian.co.uk/media/techcrunch/rss")
42
+ expected = [
43
+ { title: "TechCrunch RSS feed", href: "http://www.guardian.co.uk/media/techcrunch/rss", type: "application/rss+xml" }
44
+ ]
45
+ expect(page.feeds).to eq(expected)
43
46
  end
44
47
  end
45
48
  end
@@ -145,6 +145,13 @@ describe MetaInspector do
145
145
  end
146
146
  end
147
147
 
148
+ describe 'Relative links with empty or blank base' do
149
+ it 'should get the relative links from a document' do
150
+ m = MetaInspector.new('http://relativewithemptybase.com/company')
151
+ expect(m.links.internal).to eq(['http://relativewithemptybase.com/about', 'http://relativewithemptybase.com/sitemap'])
152
+ end
153
+ end
154
+
148
155
  describe 'Relative links with base' do
149
156
  it 'should get the relative links from a document' do
150
157
  m = MetaInspector.new('http://relativewithbase.com/company/page2')
@@ -190,20 +197,37 @@ describe MetaInspector do
190
197
  end
191
198
  end
192
199
 
193
- describe "Feed" do
194
- it "should get rss feed" do
195
- @m = MetaInspector.new('http://www.iteh.at')
196
- expect(@m.feed).to eq('http://www.iteh.at/de/rss/')
197
- end
200
+ context "Feeds" do
201
+ let(:meta) { MetaInspector.new('http://feeds.example.com') }
202
+
203
+ describe "#feeds" do
204
+ it "should return all the document's feeds" do
205
+ expected = [
206
+ { title: "Articles - JSON Feed", href: "https://example.org/feed.json", type: "application/json" },
207
+ { title: "Comments - JSON Feed", href: "https://example.org/feed/comments.json", type: "application/json" },
208
+ { title: "Articles - RSS Feed", href: "https://example.org/feed.rss", type: "application/rss+xml" },
209
+ { title: "Comments - RSS Feed", href: "https://example.org/feed/comments.rss", type: "application/rss+xml" },
210
+ { title: "Articles - Atom Feed", href: "https://example.org/feed.xml", type: "application/atom+xml" },
211
+ { title: "Comments - Atom Feed", href: "https://example.org/feed/comments.xml", type: "application/atom+xml" }
212
+ ]
213
+ expect(meta.feeds).to eq(expected)
214
+ end
198
215
 
199
- it "should get atom feed" do
200
- @m = MetaInspector.new('http://www.tea-tron.com/jbravo/blog/')
201
- expect(@m.feed).to eq('http://www.tea-tron.com/jbravo/blog/feed/')
216
+ it "should return nothing if no feeds found" do
217
+ @m = MetaInspector.new('http://www.alazan.com')
218
+ expect(@m.feeds).to eq([])
219
+ end
202
220
  end
203
221
 
204
- it "should return nil if no feed found" do
205
- @m = MetaInspector.new('http://www.alazan.com')
206
- expect(@m.feed).to eq(nil)
222
+ describe "#feed" do
223
+ it "should return the first feed's href" do
224
+ expect(meta.feed).to eq("https://example.org/feed.rss")
225
+ end
226
+
227
+ it "should give a deprecation warning" do
228
+ warning = "DEPRECATION: Use MetaInspector#feeds instead of #feed. The former gives you all feeds and their metadata, the latter will be removed.\n"
229
+ expect { meta.feed }.to output(warning).to_stderr
230
+ end
207
231
  end
208
232
  end
209
233
  end