metainspector 4.0.0.rc3 → 4.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.rubocop.yml.example +4 -0
- data/lib/meta_inspector/document.rb +41 -33
- data/lib/meta_inspector/exception_log.rb +2 -2
- data/lib/meta_inspector/exceptionable.rb +1 -1
- data/lib/meta_inspector/parser.rb +5 -5
- data/lib/meta_inspector/parsers/base.rb +1 -1
- data/lib/meta_inspector/parsers/images.rb +9 -5
- data/lib/meta_inspector/parsers/links.rb +13 -10
- data/lib/meta_inspector/parsers/meta_tags.rb +11 -11
- data/lib/meta_inspector/parsers/texts.rb +4 -3
- data/lib/meta_inspector/request.rb +5 -6
- data/lib/meta_inspector/url.rb +9 -5
- data/lib/meta_inspector/version.rb +1 -1
- data/meta_inspector.gemspec +1 -0
- metadata +19 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4fbb85a1c08f497b3c38edbdc97e0c8d96ee6c6a
|
4
|
+
data.tar.gz: 9ce2c80b81b1eb085037312e75fb82d1e46f4202
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e12a19a7598d3a9c7d83d90c121336964490dcd8b334f72d9ceb64ea8efab67c3b269445eb1ebf46eb5385169ea04a81ef155533dbe92779614eb3e0a10c50b3
|
7
|
+
data.tar.gz: 555a9b35ee7f51def2c45a24e46996cc130a65d15daebda9841c7be74fda8a2c76cb0097c53a67ad763b80272db52d84f8bdb7b99ecee124929a19b3c36a6338
|
data/.gitignore
CHANGED
data/lib/meta_inspector/document.rb
CHANGED
@@ -5,16 +5,18 @@ module MetaInspector
|
|
5
5
|
|
6
6
|
include MetaInspector::Exceptionable
|
7
7
|
|
8
|
-
# Initializes a new instance of MetaInspector::Document, setting the URL
|
8
|
+
# Initializes a new instance of MetaInspector::Document, setting the URL
|
9
9
|
# Options:
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
10
|
+
# * connection_timeout: defaults to 20 seconds
|
11
|
+
# * read_timeout: defaults to 20 seconds
|
12
|
+
# * retries: defaults to 3 times
|
13
|
+
# * html_content_type_only: if an exception should be raised if request
|
14
|
+
# content-type is not text/html. Defaults to false.
|
15
|
+
# * allow_redirections: when true, follow HTTP redirects. Defaults to true
|
16
|
+
# * document: the html of the url as a string
|
17
|
+
# * warn_level: what to do when encountering exceptions.
|
18
|
+
# Can be :warn, :raise or nil
|
19
|
+
# * headers: object containing custom headers for the request
|
18
20
|
def initialize(initial_url, options = {})
|
19
21
|
options = defaults.merge(options)
|
20
22
|
@connection_timeout = options[:connection_timeout]
|
@@ -37,25 +39,28 @@ module MetaInspector
|
|
37
39
|
end
|
38
40
|
|
39
41
|
extend Forwardable
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
42
|
+
delegate [:url, :scheme, :host, :root_url] => :@url
|
43
|
+
|
44
|
+
delegate [:content_type, :response] => :@request
|
45
|
+
|
46
|
+
delegate [:parsed, :title, :description, :links,
|
47
|
+
:images, :feed, :charset, :meta_tags,
|
48
|
+
:meta_tag, :meta, :favicon] => :@parser
|
44
49
|
|
45
50
|
# Returns all document data as a nested Hash
|
46
51
|
def to_hash
|
47
52
|
{
|
48
|
-
'url'
|
49
|
-
'title'
|
50
|
-
'links'
|
51
|
-
'images'
|
52
|
-
'charset'
|
53
|
-
'feed'
|
54
|
-
'content_type'
|
55
|
-
'meta_tags'
|
56
|
-
'favicon'
|
57
|
-
'response'
|
58
|
-
|
53
|
+
'url' => url,
|
54
|
+
'title' => title,
|
55
|
+
'links' => links.to_hash,
|
56
|
+
'images' => images.to_a,
|
57
|
+
'charset' => charset,
|
58
|
+
'feed' => feed,
|
59
|
+
'content_type' => content_type,
|
60
|
+
'meta_tags' => meta_tags,
|
61
|
+
'favicon' => images.favicon,
|
62
|
+
'response' => { 'status' => response.status,
|
63
|
+
'headers' => response.headers }
|
59
64
|
}
|
60
65
|
end
|
61
66
|
|
@@ -67,18 +72,21 @@ module MetaInspector
|
|
67
72
|
private
|
68
73
|
|
69
74
|
def defaults
|
70
|
-
{ :timeout
|
71
|
-
:retries
|
72
|
-
:html_content_only
|
73
|
-
:warn_level
|
74
|
-
:headers
|
75
|
-
:allow_redirections => true
|
76
|
-
|
75
|
+
{ :timeout => 20,
|
76
|
+
:retries => 3,
|
77
|
+
:html_content_only => false,
|
78
|
+
:warn_level => :raise,
|
79
|
+
:headers => { 'User-Agent' => default_user_agent },
|
80
|
+
:allow_redirections => true }
|
81
|
+
end
|
82
|
+
|
83
|
+
def default_user_agent
|
84
|
+
"MetaInspector/#{MetaInspector::VERSION} (+https://github.com/jaimeiniesta/metainspector)"
|
77
85
|
end
|
78
86
|
|
79
87
|
def document
|
80
|
-
@document ||= if html_content_only && content_type !=
|
81
|
-
|
88
|
+
@document ||= if html_content_only && content_type != 'text/html'
|
89
|
+
fail "The url provided contains #{content_type} content instead of text/html content"
|
82
90
|
else
|
83
91
|
@request.read
|
84
92
|
end
|
data/lib/meta_inspector/exception_log.rb
CHANGED
@@ -12,7 +12,7 @@ module MetaInspector
|
|
12
12
|
def <<(exception)
|
13
13
|
case warn_level
|
14
14
|
when :raise
|
15
|
-
|
15
|
+
fail exception
|
16
16
|
when :warn
|
17
17
|
warn exception
|
18
18
|
when :store
|
@@ -24,7 +24,7 @@ module MetaInspector
|
|
24
24
|
if warn_level == :store
|
25
25
|
exceptions.empty?
|
26
26
|
else
|
27
|
-
warn
|
27
|
+
warn 'ExceptionLog#ok? should only be used when warn_level is :store'
|
28
28
|
end
|
29
29
|
end
|
30
30
|
end
|
data/lib/meta_inspector/parser.rb
CHANGED
@@ -20,11 +20,11 @@ module MetaInspector
|
|
20
20
|
end
|
21
21
|
|
22
22
|
extend Forwardable
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
23
|
+
delegate [:url, :scheme, :host] => :@document
|
24
|
+
delegate [:meta_tags, :meta_tag, :meta, :charset] => :@meta_tag_parser
|
25
|
+
delegate [:links, :feed, :base_url] => :@links_parser
|
26
|
+
delegate :images => :@images_parser
|
27
|
+
delegate [:title, :description] => :@texts_parser
|
28
28
|
|
29
29
|
# Returns the whole parsed document
|
30
30
|
def parsed
|
data/lib/meta_inspector/parsers/images.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
module MetaInspector
|
2
2
|
module Parsers
|
3
3
|
class ImagesParser < Base
|
4
|
-
|
5
|
-
|
4
|
+
delegate [:parsed, :meta, :base_url] => :@main_parser
|
5
|
+
delegate [:each, :length, :size, :[], :last] => :images_collection
|
6
6
|
|
7
7
|
include Enumerable
|
8
8
|
|
@@ -11,7 +11,7 @@ module MetaInspector
|
|
11
11
|
end
|
12
12
|
|
13
13
|
# Returns the parsed image from Facebook's open graph property tags
|
14
|
-
# Most
|
14
|
+
# Most major websites now define this property and is usually relevant
|
15
15
|
# See doc at http://developers.facebook.com/docs/opengraph/
|
16
16
|
# If none found, tries with Twitter image
|
17
17
|
def best
|
@@ -30,11 +30,15 @@ module MetaInspector
|
|
30
30
|
private
|
31
31
|
|
32
32
|
def images_collection
|
33
|
-
@images_collection ||=
|
33
|
+
@images_collection ||= absolutified_images
|
34
|
+
end
|
35
|
+
|
36
|
+
def absolutified_images
|
37
|
+
parsed_images.map { |i| URL.absolutify(i, base_url) }
|
34
38
|
end
|
35
39
|
|
36
40
|
def parsed_images
|
37
|
-
|
41
|
+
cleanup(parsed.search('//img/@src'))
|
38
42
|
end
|
39
43
|
end
|
40
44
|
end
|
data/lib/meta_inspector/parsers/links.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module MetaInspector
|
2
2
|
module Parsers
|
3
3
|
class LinksParser < Base
|
4
|
-
|
4
|
+
delegate [:parsed, :url, :scheme, :host] => :@main_parser
|
5
5
|
|
6
6
|
def links
|
7
7
|
self
|
@@ -9,37 +9,39 @@ module MetaInspector
|
|
9
9
|
|
10
10
|
# Returns all links found, unprocessed
|
11
11
|
def raw
|
12
|
-
@raw ||= cleanup(parsed.search(
|
12
|
+
@raw ||= cleanup(parsed.search('//a/@href')).compact.uniq
|
13
13
|
end
|
14
14
|
|
15
15
|
# Returns all links found, unrelavitized and absolutified
|
16
16
|
def all
|
17
|
-
@all ||= raw.map { |
|
17
|
+
@all ||= raw.map { |link| URL.absolutify(URL.unrelativize(link, scheme), base_url) }
|
18
18
|
.compact.uniq
|
19
19
|
end
|
20
20
|
|
21
21
|
# Returns all HTTP links found
|
22
22
|
def http
|
23
|
-
@http ||= all.select {|
|
23
|
+
@http ||= all.select { |link| link =~ /^http(s)?:\/\//i}
|
24
24
|
end
|
25
25
|
|
26
26
|
# Returns all non-HTTP links found
|
27
27
|
def non_http
|
28
|
-
@non_http ||= all.select {|
|
28
|
+
@non_http ||= all.select { |link| link !~ /^http(s)?:\/\//i}
|
29
29
|
end
|
30
30
|
|
31
31
|
# Returns all internal HTTP links found
|
32
32
|
def internal
|
33
|
-
@internal ||= http.select {|link| URL.new(link).host == host }
|
33
|
+
@internal ||= http.select { |link| URL.new(link).host == host }
|
34
34
|
end
|
35
35
|
|
36
36
|
# Returns all external HTTP links found
|
37
37
|
def external
|
38
|
-
@external ||= http.select {|link| URL.new(link).host != host }
|
38
|
+
@external ||= http.select { |link| URL.new(link).host != host }
|
39
39
|
end
|
40
40
|
|
41
41
|
def to_hash
|
42
|
-
{ 'internal' => internal,
|
42
|
+
{ 'internal' => internal,
|
43
|
+
'external' => external,
|
44
|
+
'non_http' => non_http }
|
43
45
|
end
|
44
46
|
|
45
47
|
# Returns the parsed document meta rss link
|
@@ -47,7 +49,8 @@ module MetaInspector
|
|
47
49
|
@feed ||= (parsed_feed('rss') || parsed_feed('atom'))
|
48
50
|
end
|
49
51
|
|
50
|
-
# Returns the base url to absolutify relative links.
|
52
|
+
# Returns the base url to absolutify relative links.
|
53
|
+
# This can be the one set on a <base> tag,
|
51
54
|
# or the url of the document if no <base> tag was found.
|
52
55
|
def base_url
|
53
56
|
base_href || url
|
@@ -60,7 +63,7 @@ module MetaInspector
|
|
60
63
|
feed ? URL.absolutify(feed.attributes['href'].value, base_url) : nil
|
61
64
|
end
|
62
65
|
|
63
|
-
# Returns the value of the href attribute on the <base /> tag, if
|
66
|
+
# Returns the value of the href attribute on the <base /> tag, if exists
|
64
67
|
def base_href
|
65
68
|
parsed.search('base').first.attributes['href'].value rescue nil
|
66
69
|
end
|
data/lib/meta_inspector/parsers/meta_tags.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module MetaInspector
|
2
2
|
module Parsers
|
3
3
|
class MetaTagsParser < Base
|
4
|
-
|
4
|
+
delegate :parsed => :@main_parser
|
5
5
|
|
6
6
|
def meta_tags
|
7
7
|
{
|
@@ -20,10 +20,10 @@ module MetaInspector
|
|
20
20
|
meta_tag['name']
|
21
21
|
.merge(meta_tag['http-equiv'])
|
22
22
|
.merge(meta_tag['property'])
|
23
|
-
.merge(
|
23
|
+
.merge('charset' => meta_tag['charset'])
|
24
24
|
end
|
25
25
|
|
26
|
-
# Returns the charset from the meta tags,
|
26
|
+
# Returns the charset from the meta tags, searching in this order:
|
27
27
|
# <meta charset='utf-8' />
|
28
28
|
# <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
|
29
29
|
def charset
|
@@ -33,12 +33,12 @@ module MetaInspector
|
|
33
33
|
private
|
34
34
|
|
35
35
|
def charset_from_meta_charset
|
36
|
-
parsed.css(
|
36
|
+
parsed.css('meta[charset]')[0].attributes['charset'].value rescue nil
|
37
37
|
end
|
38
38
|
|
39
39
|
def charset_from_meta_content_type
|
40
40
|
parsed.css("meta[http-equiv='Content-Type']")[0]
|
41
|
-
.attributes['content'].value.split(
|
41
|
+
.attributes['content'].value.split(';')[1].split('=')[1] rescue nil
|
42
42
|
end
|
43
43
|
|
44
44
|
def meta_tags_by(attribute)
|
@@ -58,12 +58,12 @@ module MetaInspector
|
|
58
58
|
def convert_each_array_to_first_element_on(hash)
|
59
59
|
hash.each_pair do |k, v|
|
60
60
|
hash[k] = if v.is_a?(Hash)
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
61
|
+
convert_each_array_to_first_element_on(v)
|
62
|
+
elsif v.is_a?(Array)
|
63
|
+
v.first
|
64
|
+
else
|
65
|
+
v
|
66
|
+
end
|
67
67
|
end
|
68
68
|
end
|
69
69
|
|
data/lib/meta_inspector/parsers/texts.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module MetaInspector
|
2
2
|
module Parsers
|
3
3
|
class TextsParser < Base
|
4
|
-
|
4
|
+
delegate [:parsed, :meta] => :@main_parser
|
5
5
|
|
6
6
|
# Returns the parsed document title, from the content of the <title> tag
|
7
7
|
# within the <head> section.
|
@@ -9,8 +9,9 @@ module MetaInspector
|
|
9
9
|
@title ||= parsed.css('head title').inner_text rescue nil
|
10
10
|
end
|
11
11
|
|
12
|
-
# A description getter that first checks for a meta description
|
13
|
-
# guess by looking at the first paragraph
|
12
|
+
# A description getter that first checks for a meta description
|
13
|
+
# and if not present will guess by looking at the first paragraph
|
14
|
+
# with more than 120 characters
|
14
15
|
def description
|
15
16
|
meta['description'] || secondary_description
|
16
17
|
end
|
data/lib/meta_inspector/request.rb
CHANGED
@@ -18,25 +18,24 @@ module MetaInspector
|
|
18
18
|
@exception_log = options[:exception_log]
|
19
19
|
@headers = options[:headers]
|
20
20
|
|
21
|
-
response #
|
21
|
+
response # request early so we can fail early
|
22
22
|
end
|
23
23
|
|
24
24
|
extend Forwardable
|
25
|
-
|
25
|
+
delegate :url => :@url
|
26
26
|
|
27
27
|
def read
|
28
28
|
response.body if response
|
29
29
|
end
|
30
30
|
|
31
31
|
def content_type
|
32
|
-
response.headers[
|
32
|
+
response.headers['content-type'].split(';')[0] if response
|
33
33
|
end
|
34
34
|
|
35
35
|
def response
|
36
|
-
request_count ||= 0
|
37
|
-
request_count += 1
|
38
36
|
@response ||= fetch
|
39
|
-
rescue Faraday::TimeoutError, Faraday::Error::ConnectionFailed,
|
37
|
+
rescue Faraday::TimeoutError, Faraday::Error::ConnectionFailed,
|
38
|
+
RuntimeError => e
|
40
39
|
@exception_log << e
|
41
40
|
nil
|
42
41
|
end
|
data/lib/meta_inspector/url.rb
CHANGED
@@ -28,20 +28,23 @@ module MetaInspector
|
|
28
28
|
@url = normalized(with_default_scheme(new_url))
|
29
29
|
end
|
30
30
|
|
31
|
-
# Converts a protocol-relative url to its full form,
|
31
|
+
# Converts a protocol-relative url to its full form,
|
32
|
+
# depending on the scheme of the page that contains it
|
32
33
|
def self.unrelativize(url, scheme)
|
33
34
|
url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
|
34
35
|
end
|
35
36
|
|
36
|
-
#
|
37
|
-
#
|
37
|
+
# Converts a relative URL to an absolute URL, like:
|
38
|
+
# "/faq" => "http://example.com/faq"
|
39
|
+
# Respecting already absolute URLs like the ones starting with
|
40
|
+
# http:, ftp:, telnet:, mailto:, javascript: ...
|
38
41
|
def self.absolutify(url, base_url)
|
39
42
|
if url =~ /^\w*\:/i
|
40
43
|
MetaInspector::URL.new(url).url
|
41
44
|
else
|
42
45
|
Addressable::URI.join(base_url, url).normalize.to_s
|
43
46
|
end
|
44
|
-
rescue Addressable::URI::InvalidURIError
|
47
|
+
rescue Addressable::URI::InvalidURIError
|
45
48
|
nil
|
46
49
|
end
|
47
50
|
|
@@ -52,7 +55,8 @@ module MetaInspector
|
|
52
55
|
parsed(url) && parsed(url).scheme.nil? ? 'http://' + url : url
|
53
56
|
end
|
54
57
|
|
55
|
-
# Normalize url to deal with characters that should be
|
58
|
+
# Normalize url to deal with characters that should be encoded,
|
59
|
+
# add trailing slash, convert to downcase...
|
56
60
|
def normalized(url)
|
57
61
|
Addressable::URI.parse(url).normalize.to_s
|
58
62
|
end
|
data/meta_inspector.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.0.0
|
4
|
+
version: 4.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-11-
|
11
|
+
date: 2014-11-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -192,6 +192,20 @@ dependencies:
|
|
192
192
|
- - ">="
|
193
193
|
- !ruby/object:Gem::Version
|
194
194
|
version: '0'
|
195
|
+
- !ruby/object:Gem::Dependency
|
196
|
+
name: rubocop
|
197
|
+
requirement: !ruby/object:Gem::Requirement
|
198
|
+
requirements:
|
199
|
+
- - ">="
|
200
|
+
- !ruby/object:Gem::Version
|
201
|
+
version: '0'
|
202
|
+
type: :development
|
203
|
+
prerelease: false
|
204
|
+
version_requirements: !ruby/object:Gem::Requirement
|
205
|
+
requirements:
|
206
|
+
- - ">="
|
207
|
+
- !ruby/object:Gem::Version
|
208
|
+
version: '0'
|
195
209
|
description: MetaInspector lets you scrape a web page and get its title, charset,
|
196
210
|
link and meta tags
|
197
211
|
email:
|
@@ -202,6 +216,7 @@ extra_rdoc_files: []
|
|
202
216
|
files:
|
203
217
|
- ".gitignore"
|
204
218
|
- ".rspec.example"
|
219
|
+
- ".rubocop.yml.example"
|
205
220
|
- ".travis.yml"
|
206
221
|
- Gemfile
|
207
222
|
- Guardfile
|
@@ -286,9 +301,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
286
301
|
version: '0'
|
287
302
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
288
303
|
requirements:
|
289
|
-
- - "
|
304
|
+
- - ">="
|
290
305
|
- !ruby/object:Gem::Version
|
291
|
-
version:
|
306
|
+
version: '0'
|
292
307
|
requirements: []
|
293
308
|
rubyforge_project:
|
294
309
|
rubygems_version: 2.2.2
|