metainspector 4.0.0.rc3 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.rubocop.yml.example +4 -0
- data/lib/meta_inspector/document.rb +41 -33
- data/lib/meta_inspector/exception_log.rb +2 -2
- data/lib/meta_inspector/exceptionable.rb +1 -1
- data/lib/meta_inspector/parser.rb +5 -5
- data/lib/meta_inspector/parsers/base.rb +1 -1
- data/lib/meta_inspector/parsers/images.rb +9 -5
- data/lib/meta_inspector/parsers/links.rb +13 -10
- data/lib/meta_inspector/parsers/meta_tags.rb +11 -11
- data/lib/meta_inspector/parsers/texts.rb +4 -3
- data/lib/meta_inspector/request.rb +5 -6
- data/lib/meta_inspector/url.rb +9 -5
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -0
- metadata +19 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4fbb85a1c08f497b3c38edbdc97e0c8d96ee6c6a
|
4
|
+
data.tar.gz: 9ce2c80b81b1eb085037312e75fb82d1e46f4202
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e12a19a7598d3a9c7d83d90c121336964490dcd8b334f72d9ceb64ea8efab67c3b269445eb1ebf46eb5385169ea04a81ef155533dbe92779614eb3e0a10c50b3
|
7
|
+
data.tar.gz: 555a9b35ee7f51def2c45a24e46996cc130a65d15daebda9841c7be74fda8a2c76cb0097c53a67ad763b80272db52d84f8bdb7b99ecee124929a19b3c36a6338
|
data/.gitignore
CHANGED
@@ -5,16 +5,18 @@ module MetaInspector
|
|
5
5
|
|
6
6
|
include MetaInspector::Exceptionable
|
7
7
|
|
8
|
-
# Initializes a new instance of MetaInspector::Document, setting the URL
|
8
|
+
# Initializes a new instance of MetaInspector::Document, setting the URL
|
9
9
|
# Options:
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
10
|
+
# * connection_timeout: defaults to 20 seconds
|
11
|
+
# * read_timeout: defaults to 20 seconds
|
12
|
+
# * retries: defaults to 3 times
|
13
|
+
# * html_content_type_only: if an exception should be raised if request
|
14
|
+
# content-type is not text/html. Defaults to false.
|
15
|
+
# * allow_redirections: when true, follow HTTP redirects. Defaults to true
|
16
|
+
# * document: the html of the url as a string
|
17
|
+
# * warn_level: what to do when encountering exceptions.
|
18
|
+
# Can be :warn, :raise or nil
|
19
|
+
# * headers: object containing custom headers for the request
|
18
20
|
def initialize(initial_url, options = {})
|
19
21
|
options = defaults.merge(options)
|
20
22
|
@connection_timeout = options[:connection_timeout]
|
@@ -37,25 +39,28 @@ module MetaInspector
|
|
37
39
|
end
|
38
40
|
|
39
41
|
extend Forwardable
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
42
|
+
delegate [:url, :scheme, :host, :root_url] => :@url
|
43
|
+
|
44
|
+
delegate [:content_type, :response] => :@request
|
45
|
+
|
46
|
+
delegate [:parsed, :title, :description, :links,
|
47
|
+
:images, :feed, :charset, :meta_tags,
|
48
|
+
:meta_tag, :meta, :favicon] => :@parser
|
44
49
|
|
45
50
|
# Returns all document data as a nested Hash
|
46
51
|
def to_hash
|
47
52
|
{
|
48
|
-
'url'
|
49
|
-
'title'
|
50
|
-
'links'
|
51
|
-
'images'
|
52
|
-
'charset'
|
53
|
-
'feed'
|
54
|
-
'content_type'
|
55
|
-
'meta_tags'
|
56
|
-
'favicon'
|
57
|
-
'response'
|
58
|
-
|
53
|
+
'url' => url,
|
54
|
+
'title' => title,
|
55
|
+
'links' => links.to_hash,
|
56
|
+
'images' => images.to_a,
|
57
|
+
'charset' => charset,
|
58
|
+
'feed' => feed,
|
59
|
+
'content_type' => content_type,
|
60
|
+
'meta_tags' => meta_tags,
|
61
|
+
'favicon' => images.favicon,
|
62
|
+
'response' => { 'status' => response.status,
|
63
|
+
'headers' => response.headers }
|
59
64
|
}
|
60
65
|
end
|
61
66
|
|
@@ -67,18 +72,21 @@ module MetaInspector
|
|
67
72
|
private
|
68
73
|
|
69
74
|
def defaults
|
70
|
-
{ :timeout
|
71
|
-
:retries
|
72
|
-
:html_content_only
|
73
|
-
:warn_level
|
74
|
-
:headers
|
75
|
-
:allow_redirections => true
|
76
|
-
|
75
|
+
{ :timeout => 20,
|
76
|
+
:retries => 3,
|
77
|
+
:html_content_only => false,
|
78
|
+
:warn_level => :raise,
|
79
|
+
:headers => { 'User-Agent' => default_user_agent },
|
80
|
+
:allow_redirections => true }
|
81
|
+
end
|
82
|
+
|
83
|
+
def default_user_agent
|
84
|
+
"MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"
|
77
85
|
end
|
78
86
|
|
79
87
|
def document
|
80
|
-
@document ||= if html_content_only && content_type !=
|
81
|
-
|
88
|
+
@document ||= if html_content_only && content_type != 'text/html'
|
89
|
+
fail "The url provided contains #{content_type} content instead of text/html content"
|
82
90
|
else
|
83
91
|
@request.read
|
84
92
|
end
|
@@ -12,7 +12,7 @@ module MetaInspector
|
|
12
12
|
def <<(exception)
|
13
13
|
case warn_level
|
14
14
|
when :raise
|
15
|
-
|
15
|
+
fail exception
|
16
16
|
when :warn
|
17
17
|
warn exception
|
18
18
|
when :store
|
@@ -24,7 +24,7 @@ module MetaInspector
|
|
24
24
|
if warn_level == :store
|
25
25
|
exceptions.empty?
|
26
26
|
else
|
27
|
-
warn
|
27
|
+
warn 'ExceptionLog#ok? should only be used when warn_level is :store'
|
28
28
|
end
|
29
29
|
end
|
30
30
|
end
|
@@ -20,11 +20,11 @@ module MetaInspector
|
|
20
20
|
end
|
21
21
|
|
22
22
|
extend Forwardable
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
23
|
+
delegate [:url, :scheme, :host] => :@document
|
24
|
+
delegate [:meta_tags, :meta_tag, :meta, :charset] => :@meta_tag_parser
|
25
|
+
delegate [:links, :feed, :base_url] => :@links_parser
|
26
|
+
delegate :images => :@images_parser
|
27
|
+
delegate [:title, :description] => :@texts_parser
|
28
28
|
|
29
29
|
# Returns the whole parsed document
|
30
30
|
def parsed
|
@@ -1,8 +1,8 @@
|
|
1
1
|
module MetaInspector
|
2
2
|
module Parsers
|
3
3
|
class ImagesParser < Base
|
4
|
-
|
5
|
-
|
4
|
+
delegate [:parsed, :meta, :base_url] => :@main_parser
|
5
|
+
delegate [:each, :length, :size, :[], :last] => :images_collection
|
6
6
|
|
7
7
|
include Enumerable
|
8
8
|
|
@@ -11,7 +11,7 @@ module MetaInspector
|
|
11
11
|
end
|
12
12
|
|
13
13
|
# Returns the parsed image from Facebook's open graph property tags
|
14
|
-
# Most
|
14
|
+
# Most major websites now define this property and is usually relevant
|
15
15
|
# See doc at http://developers.facebook.com/docs/opengraph/
|
16
16
|
# If none found, tries with Twitter image
|
17
17
|
def best
|
@@ -30,11 +30,15 @@ module MetaInspector
|
|
30
30
|
private
|
31
31
|
|
32
32
|
def images_collection
|
33
|
-
@images_collection ||=
|
33
|
+
@images_collection ||= absolutified_images
|
34
|
+
end
|
35
|
+
|
36
|
+
def absolutified_images
|
37
|
+
parsed_images.map { |i| URL.absolutify(i, base_url) }
|
34
38
|
end
|
35
39
|
|
36
40
|
def parsed_images
|
37
|
-
|
41
|
+
cleanup(parsed.search('//img/@src'))
|
38
42
|
end
|
39
43
|
end
|
40
44
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module MetaInspector
|
2
2
|
module Parsers
|
3
3
|
class LinksParser < Base
|
4
|
-
|
4
|
+
delegate [:parsed, :url, :scheme, :host] => :@main_parser
|
5
5
|
|
6
6
|
def links
|
7
7
|
self
|
@@ -9,37 +9,39 @@ module MetaInspector
|
|
9
9
|
|
10
10
|
# Returns all links found, unprocessed
|
11
11
|
def raw
|
12
|
-
@raw ||= cleanup(parsed.search(
|
12
|
+
@raw ||= cleanup(parsed.search('//a/@href')).compact.uniq
|
13
13
|
end
|
14
14
|
|
15
15
|
# Returns all links found, unrelavitized and absolutified
|
16
16
|
def all
|
17
|
-
@all ||= raw.map { |
|
17
|
+
@all ||= raw.map { |link| URL.absolutify(URL.unrelativize(link, scheme), base_url) }
|
18
18
|
.compact.uniq
|
19
19
|
end
|
20
20
|
|
21
21
|
# Returns all HTTP links found
|
22
22
|
def http
|
23
|
-
@http ||= all.select {|
|
23
|
+
@http ||= all.select { |link| link =~ /^http(s)?:\/\//i}
|
24
24
|
end
|
25
25
|
|
26
26
|
# Returns all non-HTTP links found
|
27
27
|
def non_http
|
28
|
-
@non_http ||= all.select {|
|
28
|
+
@non_http ||= all.select { |link| link !~ /^http(s)?:\/\//i}
|
29
29
|
end
|
30
30
|
|
31
31
|
# Returns all internal HTTP links found
|
32
32
|
def internal
|
33
|
-
@internal ||= http.select {|link| URL.new(link).host == host }
|
33
|
+
@internal ||= http.select { |link| URL.new(link).host == host }
|
34
34
|
end
|
35
35
|
|
36
36
|
# Returns all external HTTP links found
|
37
37
|
def external
|
38
|
-
@external ||= http.select {|link| URL.new(link).host != host }
|
38
|
+
@external ||= http.select { |link| URL.new(link).host != host }
|
39
39
|
end
|
40
40
|
|
41
41
|
def to_hash
|
42
|
-
{ 'internal' => internal,
|
42
|
+
{ 'internal' => internal,
|
43
|
+
'external' => external,
|
44
|
+
'non_http' => non_http }
|
43
45
|
end
|
44
46
|
|
45
47
|
# Returns the parsed document meta rss link
|
@@ -47,7 +49,8 @@ module MetaInspector
|
|
47
49
|
@feed ||= (parsed_feed('rss') || parsed_feed('atom'))
|
48
50
|
end
|
49
51
|
|
50
|
-
# Returns the base url to absolutify relative links.
|
52
|
+
# Returns the base url to absolutify relative links.
|
53
|
+
# This can be the one set on a <base> tag,
|
51
54
|
# or the url of the document if no <base> tag was found.
|
52
55
|
def base_url
|
53
56
|
base_href || url
|
@@ -60,7 +63,7 @@ module MetaInspector
|
|
60
63
|
feed ? URL.absolutify(feed.attributes['href'].value, base_url) : nil
|
61
64
|
end
|
62
65
|
|
63
|
-
# Returns the value of the href attribute on the <base /> tag, if
|
66
|
+
# Returns the value of the href attribute on the <base /> tag, if exists
|
64
67
|
def base_href
|
65
68
|
parsed.search('base').first.attributes['href'].value rescue nil
|
66
69
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module MetaInspector
|
2
2
|
module Parsers
|
3
3
|
class MetaTagsParser < Base
|
4
|
-
|
4
|
+
delegate :parsed => :@main_parser
|
5
5
|
|
6
6
|
def meta_tags
|
7
7
|
{
|
@@ -20,10 +20,10 @@ module MetaInspector
|
|
20
20
|
meta_tag['name']
|
21
21
|
.merge(meta_tag['http-equiv'])
|
22
22
|
.merge(meta_tag['property'])
|
23
|
-
.merge(
|
23
|
+
.merge('charset' => meta_tag['charset'])
|
24
24
|
end
|
25
25
|
|
26
|
-
# Returns the charset from the meta tags,
|
26
|
+
# Returns the charset from the meta tags, searching in this order:
|
27
27
|
# <meta charset='utf-8' />
|
28
28
|
# <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
|
29
29
|
def charset
|
@@ -33,12 +33,12 @@ module MetaInspector
|
|
33
33
|
private
|
34
34
|
|
35
35
|
def charset_from_meta_charset
|
36
|
-
parsed.css(
|
36
|
+
parsed.css('meta[charset]')[0].attributes['charset'].value rescue nil
|
37
37
|
end
|
38
38
|
|
39
39
|
def charset_from_meta_content_type
|
40
40
|
parsed.css("meta[http-equiv='Content-Type']")[0]
|
41
|
-
.attributes['content'].value.split(
|
41
|
+
.attributes['content'].value.split(';')[1].split('=')[1] rescue nil
|
42
42
|
end
|
43
43
|
|
44
44
|
def meta_tags_by(attribute)
|
@@ -58,12 +58,12 @@ module MetaInspector
|
|
58
58
|
def convert_each_array_to_first_element_on(hash)
|
59
59
|
hash.each_pair do |k, v|
|
60
60
|
hash[k] = if v.is_a?(Hash)
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
61
|
+
convert_each_array_to_first_element_on(v)
|
62
|
+
elsif v.is_a?(Array)
|
63
|
+
v.first
|
64
|
+
else
|
65
|
+
v
|
66
|
+
end
|
67
67
|
end
|
68
68
|
end
|
69
69
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module MetaInspector
|
2
2
|
module Parsers
|
3
3
|
class TextsParser < Base
|
4
|
-
|
4
|
+
delegate [:parsed, :meta] => :@main_parser
|
5
5
|
|
6
6
|
# Returns the parsed document title, from the content of the <title> tag
|
7
7
|
# within the <head> section.
|
@@ -9,8 +9,9 @@ module MetaInspector
|
|
9
9
|
@title ||= parsed.css('head title').inner_text rescue nil
|
10
10
|
end
|
11
11
|
|
12
|
-
# A description getter that first checks for a meta description
|
13
|
-
# guess by looking at the first paragraph
|
12
|
+
# A description getter that first checks for a meta description
|
13
|
+
# and if not present will guess by looking at the first paragraph
|
14
|
+
# with more than 120 characters
|
14
15
|
def description
|
15
16
|
meta['description'] || secondary_description
|
16
17
|
end
|
@@ -18,25 +18,24 @@ module MetaInspector
|
|
18
18
|
@exception_log = options[:exception_log]
|
19
19
|
@headers = options[:headers]
|
20
20
|
|
21
|
-
response #
|
21
|
+
response # request early so we can fail early
|
22
22
|
end
|
23
23
|
|
24
24
|
extend Forwardable
|
25
|
-
|
25
|
+
delegate :url => :@url
|
26
26
|
|
27
27
|
def read
|
28
28
|
response.body if response
|
29
29
|
end
|
30
30
|
|
31
31
|
def content_type
|
32
|
-
response.headers[
|
32
|
+
response.headers['content-type'].split(';')[0] if response
|
33
33
|
end
|
34
34
|
|
35
35
|
def response
|
36
|
-
request_count ||= 0
|
37
|
-
request_count += 1
|
38
36
|
@response ||= fetch
|
39
|
-
rescue Faraday::TimeoutError, Faraday::Error::ConnectionFailed,
|
37
|
+
rescue Faraday::TimeoutError, Faraday::Error::ConnectionFailed,
|
38
|
+
RuntimeError => e
|
40
39
|
@exception_log << e
|
41
40
|
nil
|
42
41
|
end
|
data/lib/meta_inspector/url.rb
CHANGED
@@ -28,20 +28,23 @@ module MetaInspector
|
|
28
28
|
@url = normalized(with_default_scheme(new_url))
|
29
29
|
end
|
30
30
|
|
31
|
-
# Converts a protocol-relative url to its full form,
|
31
|
+
# Converts a protocol-relative url to its full form,
|
32
|
+
# depending on the scheme of the page that contains it
|
32
33
|
def self.unrelativize(url, scheme)
|
33
34
|
url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
|
34
35
|
end
|
35
36
|
|
36
|
-
#
|
37
|
-
#
|
37
|
+
# Converts a relative URL to an absolute URL, like:
|
38
|
+
# "/faq" => "http://example.com/faq"
|
39
|
+
# Respecting already absolute URLs like the ones starting with
|
40
|
+
# http:, ftp:, telnet:, mailto:, javascript: ...
|
38
41
|
def self.absolutify(url, base_url)
|
39
42
|
if url =~ /^\w*\:/i
|
40
43
|
MetaInspector::URL.new(url).url
|
41
44
|
else
|
42
45
|
Addressable::URI.join(base_url, url).normalize.to_s
|
43
46
|
end
|
44
|
-
rescue Addressable::URI::InvalidURIError
|
47
|
+
rescue Addressable::URI::InvalidURIError
|
45
48
|
nil
|
46
49
|
end
|
47
50
|
|
@@ -52,7 +55,8 @@ module MetaInspector
|
|
52
55
|
parsed(url) && parsed(url).scheme.nil? ? 'http://' + url : url
|
53
56
|
end
|
54
57
|
|
55
|
-
# Normalize url to deal with characters that should be
|
58
|
+
# Normalize url to deal with characters that should be encoded,
|
59
|
+
# add trailing slash, convert to downcase...
|
56
60
|
def normalized(url)
|
57
61
|
Addressable::URI.parse(url).normalize.to_s
|
58
62
|
end
|
data/meta_inspector.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.0.0
|
4
|
+
version: 4.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-11-
|
11
|
+
date: 2014-11-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -192,6 +192,20 @@ dependencies:
|
|
192
192
|
- - ">="
|
193
193
|
- !ruby/object:Gem::Version
|
194
194
|
version: '0'
|
195
|
+
- !ruby/object:Gem::Dependency
|
196
|
+
name: rubocop
|
197
|
+
requirement: !ruby/object:Gem::Requirement
|
198
|
+
requirements:
|
199
|
+
- - ">="
|
200
|
+
- !ruby/object:Gem::Version
|
201
|
+
version: '0'
|
202
|
+
type: :development
|
203
|
+
prerelease: false
|
204
|
+
version_requirements: !ruby/object:Gem::Requirement
|
205
|
+
requirements:
|
206
|
+
- - ">="
|
207
|
+
- !ruby/object:Gem::Version
|
208
|
+
version: '0'
|
195
209
|
description: MetaInspector lets you scrape a web page and get its title, charset,
|
196
210
|
link and meta tags
|
197
211
|
email:
|
@@ -202,6 +216,7 @@ extra_rdoc_files: []
|
|
202
216
|
files:
|
203
217
|
- ".gitignore"
|
204
218
|
- ".rspec.example"
|
219
|
+
- ".rubocop.yml.example"
|
205
220
|
- ".travis.yml"
|
206
221
|
- Gemfile
|
207
222
|
- Guardfile
|
@@ -286,9 +301,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
286
301
|
version: '0'
|
287
302
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
288
303
|
requirements:
|
289
|
-
- - "
|
304
|
+
- - ">="
|
290
305
|
- !ruby/object:Gem::Version
|
291
|
-
version:
|
306
|
+
version: '0'
|
292
307
|
requirements: []
|
293
308
|
rubyforge_project:
|
294
309
|
rubygems_version: 2.2.2
|