metainspector 5.7.0 → 5.11.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 860350c6b4704259715f0a0eea047893134d774928b0bf78c9633f5afec480ff
4
- data.tar.gz: d0abcb709893e63be252886723f8f202137dae3d02b506173a59b3368dcda7f7
3
+ metadata.gz: e684c1133fc1fee2a9ffcd99b3294d22bb1d6d8598c7faf0e5c86a117a0e8663
4
+ data.tar.gz: b542f91035175aa6304495dd8f5ec79bcf8c0279bd700948f20bc742832b0653
5
5
  SHA512:
6
- metadata.gz: ab74dde15e1864f4c747be953a512278578ab231bea4637e1fb752234766c5b895b47405bd8572c06fd421cd16170e69c4331fb3462a7177ea4c6c521748e1f9
7
- data.tar.gz: fde5cf878c7320d49c82aa430b2fc422044a07168e4b6e6cce53d3b33655b28f6ee811cc535ffca06be46c19403b83e7c453aa942895b9e77c675468fa3fea12
6
+ metadata.gz: 796aa57288c6873fad48f67cc9c9da36f04d84cd24d66c2825944e42a5655a4d5cbf2a86fddc5592c44afb166a069630d372e3faa6ac4e8964eb8408ee9176aa
7
+ data.tar.gz: 16dcd111e4197a836a75b357fd035a051247b04f2afc52ff4832a4c63eef0bd986568873f17807dbe1951bcb4b42400d379449ee98fdd2a5afee16b152f439ce
@@ -1,6 +1,5 @@
1
1
  script: "bundle exec rspec -b"
2
2
  rvm:
3
- - 2.3.6
4
- - 2.4.3
5
- - 2.5.0
6
- - 2.6.4
3
+ - 2.5.8
4
+ - 2.6.6
5
+ - 2.7.1
@@ -1,5 +1,32 @@
1
1
  # MetaInspector Changelog
2
2
 
3
+ ## Unreleased
4
+
5
+ * Upgrade to Faraday 1.1.
6
+
7
+ ## [Changes in 5.10.1](https://github.com/jaimeiniesta/metainspector/compare/v5.10.0...v5.10.1)
8
+
9
+ * Fix for empty base_href. Makes relative links work when base_href is not nil but empty ("").
10
+ * Drop support for Ruby 2.4, add support for Ruby 2.7.
11
+
12
+ ## [Changes in 5.10](https://github.com/jaimeiniesta/metainspector/compare/v5.9.0...v5.10.0)
13
+
14
+ * Upgrade to Faraday 1.0.
15
+
16
+ ## [Changes in 5.9](https://github.com/jaimeiniesta/metainspector/compare/v5.8.0...v5.9.0)
17
+
18
+ * Added #feeds method to retrieve all feeds of a page.
19
+ * Adds deprecation warning on #feed method.
20
+
21
+ ## [Changes in 5.8](https://github.com/jaimeiniesta/metainspector/compare/v5.7.0...v5.8.0)
22
+
23
+ * Added h1..h6 support.
24
+
25
+ ## [Changes in 5.7](https://github.com/jaimeiniesta/metainspector/compare/v5.6.0...v5.7.0)
26
+
27
+ * Avoids normalizing image URLs. https://github.com/jaimeiniesta/metainspector/pull/241
28
+ * Adds `NonHtmlErrorException` instead of `ParserError` https://github.com/jaimeiniesta/metainspector/pull/248
29
+
3
30
  ## [Changes in 5.6](https://github.com/jaimeiniesta/metainspector/compare/v5.5.0...v5.6.0)
4
31
 
5
32
  * New feature: `:encoding` option to force the encoding of a parsed document.
data/README.md CHANGED
@@ -22,6 +22,8 @@ If you're using it on a Rails application, just add it to your Gemfile and run `
22
22
  gem 'metainspector'
23
23
  ```
24
24
 
25
+ Supported Ruby versions are defined in [`.travis.yml`](.travis.yml).
26
+
25
27
  ## Usage
26
28
 
27
29
  Initialize a MetaInspector instance for a URL, like this:
@@ -73,7 +75,7 @@ page.root_url # Root url (scheme + host, like http://sitevalidator.co
73
75
  page.head_links # an array of hashes of all head/links
74
76
  page.stylesheets # an array of hashes of all head/links where rel='stylesheet'
75
77
  page.canonicals # an array of hashes of all head/links where rel='canonical'
76
- page.feed # Get rss or atom links in meta data fields as array
78
+ page.feeds # Get rss or atom links in meta data fields as an array of hashes in the form { href: "...", title: "...", type: "..." }
77
79
  ```
78
80
 
79
81
  ### Texts
@@ -85,6 +87,12 @@ page.author # author of the page from the meta author tag
85
87
  page.best_author # best author of the page, from a selection of candidates
86
88
  page.description # returns the meta description
87
89
  page.best_description # returns the first non-empty description between the following candidates: standard meta description, og:description, twitter:description, the first long paragraph
90
+ page.h1 # returns h1 text array
91
+ page.h2 # returns h2 text array
92
+ page.h3 # returns h3 text array
93
+ page.h4 # returns h4 text array
94
+ page.h5 # returns h5 text array
95
+ page.h6 # returns h6 text array
88
96
  ```
89
97
 
90
98
  ### Links
@@ -48,8 +48,8 @@ module MetaInspector
48
48
  delegate [:content_type, :response] => :@request
49
49
 
50
50
  delegate [:parsed, :title, :best_title, :author, :best_author,
51
- :description, :best_description, :links,
52
- :images, :feed, :charset, :meta_tags,
51
+ :h1, :h2, :h3, :h4, :h5, :h6, :description, :best_description, :links,
52
+ :images, :feeds, :feed, :charset, :meta_tags,
53
53
  :meta_tag, :meta, :favicon,
54
54
  :head_links, :stylesheets, :canonicals] => :@parser
55
55
 
@@ -66,10 +66,17 @@ module MetaInspector
66
66
  'best_author' => best_author,
67
67
  'description' => description,
68
68
  'best_description' => best_description,
69
+ 'h1' => h1,
70
+ 'h2' => h2,
71
+ 'h3' => h3,
72
+ 'h4' => h4,
73
+ 'h5' => h5,
74
+ 'h6' => h6,
69
75
  'links' => links.to_hash,
70
76
  'images' => images.to_a,
71
77
  'charset' => charset,
72
78
  'feed' => feed,
79
+ 'feeds' => feeds,
73
80
  'content_type' => content_type,
74
81
  'meta_tags' => meta_tags,
75
82
  'favicon' => images.favicon,
@@ -23,10 +23,11 @@ module MetaInspector
23
23
  extend Forwardable
24
24
  delegate [:url, :scheme, :host] => :@document
25
25
  delegate [:meta_tags, :meta_tag, :meta, :charset] => :@meta_tag_parser
26
- delegate [:head_links, :stylesheets, :canonicals, :feed] => :@head_links_parser
26
+ delegate [:head_links, :stylesheets, :canonicals, :feeds, :feed] => :@head_links_parser
27
27
  delegate [:links, :base_url] => :@links_parser
28
28
  delegate :images => :@images_parser
29
- delegate [:title, :best_title, :author, :best_author, :description, :best_description] => :@texts_parser
29
+ delegate [:title, :best_title, :author, :best_author, :description, :best_description,
30
+ :h1, :h2, :h3, :h4, :h5, :h6] => :@texts_parser
30
31
 
31
32
  # Returns the whole parsed document
32
33
  def parsed
@@ -3,6 +3,10 @@ module MetaInspector
3
3
  class HeadLinksParser < Base
4
4
  delegate [:parsed, :base_url] => :@main_parser
5
5
 
6
+ KNOWN_FEED_TYPES = %w[
7
+ application/rss+xml application/atom+xml application/json
8
+ ].freeze
9
+
6
10
  def head_links
7
11
  @head_links ||= parsed.css('head link').map do |tag|
8
12
  Hash[
@@ -24,16 +28,25 @@ module MetaInspector
24
28
  @canonicals ||= head_links.select { |hl| hl[:rel] == 'canonical' }
25
29
  end
26
30
 
27
- # Returns the parsed document meta rss link
28
- def feed
29
- @feed ||= (parsed_feed('rss') || parsed_feed('atom'))
30
- end
31
+ def feeds
32
+ @feeds ||=
33
+ parsed.search("//link[@rel='alternate']").map do |link|
34
+ next if !KNOWN_FEED_TYPES.include?(link["type"]) || link["href"].to_s.strip == ''
31
35
 
32
- private
36
+ {
37
+ title: link["title"],
38
+ href: URL.absolutify(link["href"], base_url),
39
+ type: link["type"]
40
+ }
41
+ end.compact
42
+ end
33
43
 
34
- def parsed_feed(format)
35
- feed = parsed.search("//link[@type='application/#{format}+xml']").find{|link| link.attributes["href"] }
36
- feed ? URL.absolutify(feed['href'], base_url) : nil
44
+ def feed
45
+ warn "DEPRECATION: Use MetaInspector#feeds instead of #feed. The former gives you all feeds and their metadata, the latter will be removed."
46
+ @feed ||= begin
47
+ first_feed = feeds.find { |l| /\/(rss|atom)\+xml$/i =~ l[:type] } || {}
48
+ first_feed[:href]
49
+ end
37
50
  end
38
51
  end
39
52
  end
@@ -47,7 +47,8 @@ module MetaInspector
47
47
  # This can be the one set on a <base> tag,
48
48
  # or the url of the document if no <base> tag was found.
49
49
  def base_url
50
- base_href || url
50
+ current_base_href = base_href.to_s.strip.empty? ? nil : base_href
51
+ current_base_href || url
51
52
  end
52
53
 
53
54
  # Returns the value of the href attribute on the <base /> tag, if exists
@@ -13,6 +13,30 @@ module MetaInspector
13
13
  @best_title ||= find_best_title
14
14
  end
15
15
 
16
+ def h1
17
+ @h1 ||= find_heading('h1')
18
+ end
19
+
20
+ def h2
21
+ @h2 ||= find_heading('h2')
22
+ end
23
+
24
+ def h3
25
+ @h3 ||= find_heading('h3')
26
+ end
27
+
28
+ def h4
29
+ @h4 ||= find_heading('h4')
30
+ end
31
+
32
+ def h5
33
+ @h5 ||= find_heading('h5')
34
+ end
35
+
36
+ def h6
37
+ @h6 ||= find_heading('h6')
38
+ end
39
+
16
40
  # Returns the meta author, if present
17
41
  def author
18
42
  @author ||= meta['author']
@@ -45,6 +69,10 @@ module MetaInspector
45
69
 
46
70
  private
47
71
 
72
+ def find_heading(heading)
73
+ parsed.css(heading).map { |tag| tag.inner_text.strip.gsub(/\s+/, ' ') }.reject(&:empty?)
74
+ end
75
+
48
76
  # Look for candidates per list of priority
49
77
  def find_best_title
50
78
  candidates = [
@@ -48,7 +48,7 @@ module MetaInspector
48
48
  @response ||= fetch
49
49
  rescue Faraday::TimeoutError => e
50
50
  raise MetaInspector::TimeoutError.new(e)
51
- rescue Faraday::Error::ConnectionFailed, Faraday::SSLError, URI::InvalidURIError, FaradayMiddleware::RedirectLimitReached => e
51
+ rescue Faraday::ConnectionFailed, Faraday::SSLError, URI::InvalidURIError, FaradayMiddleware::RedirectLimitReached => e
52
52
  raise MetaInspector::RequestError.new(e)
53
53
  end
54
54
 
@@ -1,3 +1,3 @@
1
1
  module MetaInspector
2
- VERSION = '5.7.0'
2
+ VERSION = '5.11.0'
3
3
  end
@@ -1,11 +1,11 @@
1
1
  require File.expand_path('../lib/meta_inspector/version', __FILE__)
2
2
 
3
3
  Gem::Specification.new do |gem|
4
- gem.authors = ["Jaime Iniesta"]
5
- gem.email = ["jaimeiniesta@gmail.com"]
4
+ gem.author = "Jaime Iniesta"
5
+ gem.email = "jaimeiniesta@gmail.com"
6
6
  gem.description = %q{MetaInspector lets you scrape a web page and get its links, images, texts, meta tags...}
7
7
  gem.summary = %q{MetaInspector is a ruby gem for web scraping purposes, that returns metadata from a given URL}
8
- gem.homepage = "https://github.com/jaimeiniesta/metainspector"
8
+ gem.homepage = "https://github.com/metainspector/metainspector"
9
9
  gem.license = "MIT"
10
10
 
11
11
  gem.files = `git ls-files`.split("\n")
@@ -14,20 +14,20 @@ Gem::Specification.new do |gem|
14
14
  gem.require_paths = ["lib"]
15
15
  gem.version = MetaInspector::VERSION
16
16
 
17
- gem.add_dependency 'nokogiri', '~> 1.10.4'
18
- gem.add_dependency 'faraday', '~> 0.15.3'
19
- gem.add_dependency 'faraday_middleware', '~> 0.12.2'
20
- gem.add_dependency 'faraday-cookie_jar', '~> 0.0.6'
21
- gem.add_dependency 'faraday-http-cache', '~> 2.0.0'
17
+ gem.add_dependency 'nokogiri', '~> 1.10.9'
18
+ gem.add_dependency 'faraday', '~> 1.1.0'
19
+ gem.add_dependency 'faraday_middleware', '~> 1.0.0'
20
+ gem.add_dependency 'faraday-cookie_jar', '~> 0.0.7'
21
+ gem.add_dependency 'faraday-http-cache', '~> 2.2.0'
22
22
  gem.add_dependency 'faraday-encoding', '~> 0.0.5'
23
- gem.add_dependency 'addressable', '~> 2.5.2'
24
- gem.add_dependency 'fastimage', '~> 2.1.4'
23
+ gem.add_dependency 'addressable', '~> 2.7.0'
24
+ gem.add_dependency 'fastimage', '~> 2.1.7'
25
25
  gem.add_dependency 'nesty', '~> 1.0.2'
26
26
 
27
- gem.add_development_dependency 'rspec', '~> 3.8.0'
28
- gem.add_development_dependency 'webmock', '~> 3.7.6'
27
+ gem.add_development_dependency 'rspec', '~> 3.9.0'
28
+ gem.add_development_dependency 'webmock', '~> 3.8.3'
29
29
  gem.add_development_dependency 'awesome_print', '~> 1.8.0'
30
- gem.add_development_dependency 'rake', '~> 12.3.1'
31
- gem.add_development_dependency 'pry', '~> 0.12.2'
32
- gem.add_development_dependency 'rubocop', '~> 0.60.0'
30
+ gem.add_development_dependency 'rake', '~> 13.0.1'
31
+ gem.add_development_dependency 'pry', '~> 0.13.1'
32
+ gem.add_development_dependency 'rubocop', '~> 0.82.0'
33
33
  end
@@ -44,6 +44,13 @@ describe MetaInspector::Document do
44
44
  "images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
45
45
  "charset" => "utf-8",
46
46
  "feed" => "http://feeds.feedburner.com/PageRankAlert",
47
+ "feeds" => [{href: "http://feeds.feedburner.com/PageRankAlert", title: "PageRankAlert.com blog", type: "application/rss+xml"}],
48
+ "h1" => [],
49
+ "h2" => ["Track your PageRank changes"],
50
+ "h3" => ["WHAT'S YOUR PAGERANK?"],
51
+ "h4" => ["Build your own lists", "Get e-mail alerts", "Track your history"],
52
+ "h5" => [],
53
+ "h6" => [],
47
54
  "content_type" => "text/html",
48
55
  "meta_tags" => {
49
56
  "name" => {
@@ -0,0 +1,23 @@
1
+ HTTP/1.1 200
2
+ date: Wed, 08 Jan 2020 23:21:58 GMT
3
+ content-type: text/html; charset=UTF-8
4
+ server: nginx/0.7.67
5
+
6
+ <!DOCTYPE html>
7
+ <html>
8
+ <head>
9
+ <title>a page with feeds</title>
10
+ <link rel="alternate" title="Articles - JSON Feed" type="application/json" href="https://example.org/feed.json" />
11
+ <link rel="alternate" title="Comments - JSON Feed" type="application/json" href="https://example.org/feed/comments.json" />
12
+ <link rel="alternate" title="Articles - RSS Feed" type="application/rss+xml" href="https://example.org/feed.rss" />
13
+ <link rel="alternate" title="Comments - RSS Feed" type="application/rss+xml" href="https://example.org/feed/comments.rss" />
14
+ <link rel="alternate" title="Articles - Atom Feed" type="application/atom+xml" href="https://example.org/feed.xml" />
15
+ <link rel="alternate" title="Comments - Atom Feed" type="application/atom+xml" href="https://example.org/feed/comments.xml" />
16
+
17
+ <link rel="alternate" title="Invalid Feed" />
18
+ <link rel="alternate" title="Feed with empty href" type="application/atom+xml" href="" />
19
+ </head>
20
+ <body>
21
+
22
+ </body>
23
+ </html>
@@ -0,0 +1,23 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/0.7.67
3
+ Date: Fri, 18 Nov 2011 21:46:46 GMT
4
+ Content-Type: text/html
5
+ Connection: keep-alive
6
+ Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
7
+ Content-Length: 4987
8
+ X-Varnish: 2000423390
9
+ Age: 0
10
+ Via: 1.1 varnish
11
+
12
+ <html>
13
+ <head>
14
+ </head>
15
+ <body>
16
+ <h1>H1</h1>
17
+ <h2>H2</h2>
18
+ <h3>H3</h3>
19
+ <h4>H4</h4>
20
+ <h5>H5</h5>
21
+ <h6>H6</h6>
22
+ </body>
23
+ </html>
@@ -0,0 +1,22 @@
1
+ HTTP/1.1 200 OK
2
+ Server: nginx/1.0.5
3
+ Date: Thu, 29 Dec 2011 23:10:13 GMT
4
+ Content-Type: text/html
5
+ Content-Length: 15013
6
+ Last-Modified: Fri, 02 Dec 2011 21:00:49 GMT
7
+ Connection: keep-alive
8
+ Accept-Ranges: bytes
9
+
10
+ <!DOCTYPE html>
11
+ <html>
12
+ <head>
13
+ <base href=""/>
14
+ <meta charset="utf-8" />
15
+ <title>Relative links</title>
16
+ </head>
17
+ <body>
18
+ <p>Relative links</p>
19
+ <a href="about">About</a>
20
+ <a href="../sitemap">Sitemap</a>
21
+ </body>
22
+ </html>
@@ -39,7 +39,10 @@ describe MetaInspector do
39
39
  context "on page with some broken feed links" do
40
40
  let(:page){ MetaInspector.new('http://example.com/broken_head_links') }
41
41
  it "tries to find correct one" do
42
- expect(page.feed).to eq("http://www.guardian.co.uk/media/techcrunch/rss")
42
+ expected = [
43
+ { title: "TechCrunch RSS feed", href: "http://www.guardian.co.uk/media/techcrunch/rss", type: "application/rss+xml" }
44
+ ]
45
+ expect(page.feeds).to eq(expected)
43
46
  end
44
47
  end
45
48
  end
@@ -145,6 +145,13 @@ describe MetaInspector do
145
145
  end
146
146
  end
147
147
 
148
+ describe 'Relative links with empty or blank base' do
149
+ it 'should get the relative links from a document' do
150
+ m = MetaInspector.new('http://relativewithemptybase.com/company')
151
+ expect(m.links.internal).to eq(['http://relativewithemptybase.com/about', 'http://relativewithemptybase.com/sitemap'])
152
+ end
153
+ end
154
+
148
155
  describe 'Relative links with base' do
149
156
  it 'should get the relative links from a document' do
150
157
  m = MetaInspector.new('http://relativewithbase.com/company/page2')
@@ -190,20 +197,37 @@ describe MetaInspector do
190
197
  end
191
198
  end
192
199
 
193
- describe "Feed" do
194
- it "should get rss feed" do
195
- @m = MetaInspector.new('http://www.iteh.at')
196
- expect(@m.feed).to eq('http://www.iteh.at/de/rss/')
197
- end
200
+ context "Feeds" do
201
+ let(:meta) { MetaInspector.new('http://feeds.example.com') }
202
+
203
+ describe "#feeds" do
204
+ it "should return all the document's feeds" do
205
+ expected = [
206
+ { title: "Articles - JSON Feed", href: "https://example.org/feed.json", type: "application/json" },
207
+ { title: "Comments - JSON Feed", href: "https://example.org/feed/comments.json", type: "application/json" },
208
+ { title: "Articles - RSS Feed", href: "https://example.org/feed.rss", type: "application/rss+xml" },
209
+ { title: "Comments - RSS Feed", href: "https://example.org/feed/comments.rss", type: "application/rss+xml" },
210
+ { title: "Articles - Atom Feed", href: "https://example.org/feed.xml", type: "application/atom+xml" },
211
+ { title: "Comments - Atom Feed", href: "https://example.org/feed/comments.xml", type: "application/atom+xml" }
212
+ ]
213
+ expect(meta.feeds).to eq(expected)
214
+ end
198
215
 
199
- it "should get atom feed" do
200
- @m = MetaInspector.new('http://www.tea-tron.com/jbravo/blog/')
201
- expect(@m.feed).to eq('http://www.tea-tron.com/jbravo/blog/feed/')
216
+ it "should return nothing if no feeds found" do
217
+ @m = MetaInspector.new('http://www.alazan.com')
218
+ expect(@m.feeds).to eq([])
219
+ end
202
220
  end
203
221
 
204
- it "should return nil if no feed found" do
205
- @m = MetaInspector.new('http://www.alazan.com')
206
- expect(@m.feed).to eq(nil)
222
+ describe "#feed" do
223
+ it "should return the first feed's href" do
224
+ expect(meta.feed).to eq("https://example.org/feed.rss")
225
+ end
226
+
227
+ it "should give a deprecation warning" do
228
+ warning = "DEPRECATION: Use MetaInspector#feeds instead of #feed. The former gives you all feeds and their metadata, the latter will be removed.\n"
229
+ expect { meta.feed }.to output(warning).to_stderr
230
+ end
207
231
  end
208
232
  end
209
233
  end