metainspector 1.16.1 → 1.17.0
- checksums.yaml +4 -4
- data/README.md +17 -11
- data/lib/meta_inspector.rb +10 -3
- data/lib/meta_inspector/deprecations.rb +19 -0
- data/lib/meta_inspector/document.rb +81 -0
- data/lib/meta_inspector/exception_log.rb +29 -0
- data/lib/meta_inspector/exceptionable.rb +11 -0
- data/lib/meta_inspector/parser.rb +178 -0
- data/lib/meta_inspector/request.rb +55 -0
- data/lib/meta_inspector/url.rb +76 -0
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/document_spec.rb +97 -0
- data/spec/exception_log_spec.rb +59 -0
- data/spec/meta_inspector_spec.rb +9 -0
- data/spec/parser_spec.rb +374 -0
- data/spec/redirections_spec.rb +20 -3
- data/spec/request_spec.rb +64 -0
- data/spec/url_spec.rb +74 -0
- metadata +18 -7
- data/lib/meta_inspector/scraper.rb +0 -283
- data/spec/metainspector_spec.rb +0 -547
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 54f34fbd4dec77ffa68eb9762cdc140e98246817
+  data.tar.gz: 0c294be322b646fa90150c3934218db529af3c6b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 79a235d8161922f9991f9df68a524f18076562c56ff68cee29a4ac8dc88c6abd1b1113363aaa9d79b39e9f703abcdcc801ccd24930c3a085157979814498c132
+  data.tar.gz: 68b165770c8c8de5d56b923c9b25405f500ac3ee00bfb245af41a91dcf9d2ac7fcb1bf6fcbf6d6ec80b9f7453462d2c2235a14109a0a09710ec424d5f4b59a08
data/README.md
CHANGED
@@ -74,11 +74,11 @@ You can also access most of the scraped data as a hash:
 
 The original document is accessible from:
 
-    page.
+    page.to_s               # A String with the contents of the HTML document
 
 And the full scraped document is accessible from:
 
-    page.
+    page.parsed             # Nokogiri doc that you can use it to get any element from the page
 
 ## Opengraph and Twitter card meta tags
 
@@ -91,8 +91,8 @@ Twitter cards & Open graph tags make it possible for you to attach media experie
 
 Also many sites use name & property, content & value attributes interchangeably. Using MetaInspector accessing this information is as easy as -
 
-    page.meta_og_image
-    page.meta_twitter_image_width
+    page.meta_og_image
+    page.meta_twitter_image_width
 
 Note that MetaInspector gives priority to content over value. In other words if there is a tag of the form
 
@@ -122,7 +122,7 @@ However, you can tell MetaInspector to allow these redirections with the option
 
 ### HTML Content Only
 
-MetaInspector will try to parse all URLs by default. If you want to raise an
+MetaInspector will try to parse all URLs by default. If you want to raise an exception when trying to parse a non-html URL (one that has a content-type different than text/html), you can state it like this:
 
     page = MetaInspector.new('markupvalidator.com', :html_content_only => true)
 
@@ -137,21 +137,27 @@ This is useful when using MetaInspector on web spidering. Although on the initia
     page.title              # returns nil
     page.content_type       # "image/png"
    page.ok?                # false
-    page.
+    page.exceptions.first.message # "The url provided contains image/png content instead of text/html content"
 
-##
+## Exception handling
 
 You can check if the page has been succesfully parsed with:
 
     page.ok?            # Will return true if everything looks OK
 
-In case there have been any
+In case there have been any exceptions, you can check them with:
 
-    page.
+    page.exceptions     # Will return an array with the exceptions
 
-
+You can also specify what to do when encountering an exception. By default it
+will store it, but you can also tell MetaInspector to warn about it on the log
+console, or to raise the exceptions, like this:
 
-
+    # This will warn about the exception on console
+    page = MetaInspector.new('http://example.com', warn_level: :warn)
+
+    # This will raise the exception
+    page = MetaInspector.new('http://example.com', warn_level: :raise)
 
 ## Examples
 
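The options described in the README changes above can be combined. A minimal usage sketch (the URL is only a placeholder, and the comments describe expected behaviour rather than guaranteed output):

    page = MetaInspector.new('http://example.com', html_content_only: true,
                                                   warn_level:        :warn)

    page.ok?          # true if no exceptions were stored while scraping
    page.title        # contents of the <title> tag
    page.exceptions   # any exceptions collected along the way
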
data/lib/meta_inspector.rb
CHANGED
@@ -1,12 +1,19 @@
 # -*- encoding: utf-8 -*-
 
-require
+require 'forwardable'
+require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/exceptionable'))
+require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/exception_log'))
+require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/request'))
+require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/url'))
+require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parser'))
+require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/document'))
+require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/deprecations'))
 
 module MetaInspector
   extend self
 
-  # Sugar method to be able to
+  # Sugar method to be able to scrape a document in a shorter way
   def new(url, options = {})
-
+    Document.new(url, options)
   end
 end
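As the diff above shows, MetaInspector.new is now just sugar for building a MetaInspector::Document, so these two calls should be equivalent (placeholder URL):

    page = MetaInspector.new('http://example.com')
    page = MetaInspector::Document.new('http://example.com')
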
data/lib/meta_inspector/deprecations.rb
ADDED
@@ -0,0 +1,19 @@
+# -*- encoding: utf-8 -*-
+
+module MetaInspector
+  class Scraper < Document
+    def initialize
+      warn "The Scraper class is now deprecated since version 1.17, use Document instead"
+      super
+    end
+
+    def errors
+      warn "The #errors method is deprecated since version 1.17, use #exceptions instead"
+      exceptions
+    end
+
+    def document
+      warn "The #document method is deprecated since version 1.17, use #to_s instead"
+    end
+  end
+end
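The shim above keeps the 1.16 names callable while pointing at their 1.17 replacements; for code still holding a Scraper instance the renames are one-to-one:

    page.errors     # deprecated, warns and returns page.exceptions
    page.document   # deprecated, warns; use page.to_s instead
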
data/lib/meta_inspector/document.rb
ADDED
@@ -0,0 +1,81 @@
+# -*- encoding: utf-8 -*-
+
+module MetaInspector
+  # A MetaInspector::Document knows about its URL and its contents
+  class Document
+    attr_reader :timeout, :html_content_only, :allow_redirections, :warn_level
+
+    include MetaInspector::Exceptionable
+
+    # Initializes a new instance of MetaInspector::Document, setting the URL to the one given
+    # Options:
+    # => timeout: defaults to 20 seconds
+    # => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
+    # => allow_redirections: when :safe, allows HTTP => HTTPS redirections. When :all, it also allows HTTPS => HTTP
+    # => document: the html of the url as a string
+    # => warn_level: what to do when encountering exceptions. Can be :warn, :raise or nil
+    def initialize(initial_url, options = {})
+      options             = defaults.merge(options)
+      @timeout            = options[:timeout]
+      @html_content_only  = options[:html_content_only]
+      @allow_redirections = options[:allow_redirections]
+      @document           = options[:document]
+
+      if options[:verbose] == true
+        warn "The verbose option is deprecated since 1.17, please use warn_level: :warn instead"
+        options[:warn_level] = :warn
+      end
+
+      @warn_level = options[:warn_level]
+
+      @exception_log = MetaInspector::ExceptionLog.new(warn_level: warn_level)
+      @url           = MetaInspector::URL.new(initial_url, exception_log: @exception_log)
+      @request       = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
+                                                        timeout:            @timeout,
+                                                        exception_log:      @exception_log)
+      @parser        = MetaInspector::Parser.new(self, exception_log: @exception_log)
+    end
+
+    extend Forwardable
+    def_delegators :@url,     :url, :scheme, :host, :root_url
+    def_delegators :@request, :content_type
+    def_delegators :@parser,  :parsed, :method_missing, :title, :description, :links, :internal_links, :external_links,
+                              :images, :image, :feed, :charset
+
+    # Returns all document data as a nested Hash
+    def to_hash
+      {
+        'url'            => url,
+        'title'          => title,
+        'links'          => links,
+        'internal_links' => internal_links,
+        'external_links' => external_links,
+        'images'         => images,
+        'charset'        => charset,
+        'feed'           => feed,
+        'content_type'   => content_type
+      }.merge @parser.to_hash
+    end
+
+    # Returns the contents of the document as a string
+    def to_s
+      document
+    end
+
+    private
+
+    def defaults
+      { :timeout => 20, :html_content_only => false }
+    end
+
+    def document
+      @document ||= if html_content_only && content_type != "text/html"
+                      raise "The url provided contains #{content_type} content instead of text/html content" and nil
+                    else
+                      @request.read
+                    end
+    rescue Exception => e
+      @exception_log << e
+    end
+  end
+end
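Document wires the URL, Request, ExceptionLog and Parser collaborators together and forwards most readers to them. A rough sketch of the resulting surface (placeholder URL, illustrative comments):

    page = MetaInspector::Document.new('http://example.com', timeout:           10,
                                                              html_content_only: true,
                                                              warn_level:        :warn)

    page.url           # delegated to MetaInspector::URL
    page.content_type  # delegated to MetaInspector::Request
    page.title         # delegated to MetaInspector::Parser
    page.to_hash       # nested Hash: url, title, links, images, charset, feed, content_type...
    page.to_s          # the raw HTML, fetched lazily through Request#read
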
data/lib/meta_inspector/exception_log.rb
ADDED
@@ -0,0 +1,29 @@
+# -*- encoding: utf-8 -*-
+
+module MetaInspector
+
+  # Stores the exceptions passed to it, warning about them if required
+  class ExceptionLog
+    attr_reader :exceptions, :warn_level
+
+    def initialize(options = {})
+      @exceptions = []
+      @warn_level = options[:warn_level]
+    end
+
+    def <<(exception)
+      case warn_level
+      when :warn
+        warn exception
+      when :raise
+        raise exception
+      end
+
+      @exceptions << exception
+    end
+
+    def ok?
+      exceptions.empty?
+    end
+  end
+end
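ExceptionLog is the object the warn_level option ultimately configures: it collects exceptions and optionally warns about or re-raises them. A small sketch (values are illustrative):

    log = MetaInspector::ExceptionLog.new(warn_level: :warn)

    log << StandardError.new("boom")   # warns, then stores the exception
    log.ok?                            # false, the log is no longer empty
    log.exceptions                     # => [#<StandardError: boom>]

With warn_level: :raise the << call re-raises instead, and with no warn_level the exception is stored silently.
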
data/lib/meta_inspector/parser.rb
ADDED
@@ -0,0 +1,178 @@
+# -*- encoding: utf-8 -*-
+
+require 'nokogiri'
+require 'hashie/rash'
+
+module MetaInspector
+  # Parses the document with Nokogiri
+  class Parser
+    include MetaInspector::Exceptionable
+
+    def initialize(document, options = {})
+      options = defaults.merge(options)
+
+      @document      = document
+      @data          = Hashie::Rash.new
+      @exception_log = options[:exception_log]
+    end
+
+    extend Forwardable
+    def_delegators :@document, :url, :scheme, :host
+
+    # Returns the whole parsed document
+    def parsed
+      @parsed ||= Nokogiri::HTML(@document.to_s)
+
+    rescue Exception => e
+      @exception_log << e
+    end
+
+    def to_hash
+      scrape_meta_data
+      @data.to_hash
+    end
+
+    # Returns the parsed document title, from the content of the <title> tag.
+    # This is not the same as the meta_title tag
+    def title
+      @title ||= parsed.css('title').inner_text rescue nil
+    end
+
+    # A description getter that first checks for a meta description and if not present will
+    # guess by looking at the first paragraph with more than 120 characters
+    def description
+      meta_description || secondary_description
+    end
+
+    # Links found on the page, as absolute URLs
+    def links
+      @links ||= parsed_links.map{ |l| URL.absolutify(URL.unrelativize(l, scheme), base_url) }.compact.uniq
+    end
+
+    # Internal links found on the page, as absolute URLs
+    def internal_links
+      @internal_links ||= links.select {|link| URL.new(link).host == host }
+    end
+
+    # External links found on the page, as absolute URLs
+    def external_links
+      @external_links ||= links.select {|link| URL.new(link).host != host }
+    end
+
+    # Images found on the page, as absolute URLs
+    def images
+      @images ||= parsed_images.map{ |i| URL.absolutify(i, base_url) }
+    end
+
+    # Returns the parsed image from Facebook's open graph property tags
+    # Most all major websites now define this property and is usually very relevant
+    # See doc at http://developers.facebook.com/docs/opengraph/
+    def image
+      meta_og_image || meta_twitter_image
+    end
+
+    # Returns the parsed document meta rss link
+    def feed
+      @feed ||= (parsed_feed('rss') || parsed_feed('atom'))
+    end
+
+    # Returns the charset from the meta tags, looking for it in the following order:
+    # <meta charset='utf-8' />
+    # <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
+    def charset
+      @charset ||= (charset_from_meta_charset || charset_from_meta_content_type)
+    end
+
+    private
+
+    def defaults
+      { exception_log: MetaInspector::ExceptionLog.new }
+    end
+
+    # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
+    # meta name: keywords, description, robots, generator
+    # meta http-equiv: content-language, Content-Type
+    #
+    # It will first try with meta name="..." and if nothing found,
+    # with meta http-equiv="...", substituting "_" by "-"
+    # TODO: define respond_to? to return true on the meta_name methods
+    def method_missing(method_name)
+      if method_name.to_s =~ /^meta_(.*)/
+        key = $1
+
+        #special treatment for opengraph (og:) and twitter card (twitter:) tags
+        key.gsub!("_",":") if key =~ /^og_(.*)/ || key =~ /^twitter_(.*)/
+
+        scrape_meta_data
+
+        @data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
+      else
+        super
+      end
+    end
+
+    # Scrapes all meta tags found
+    def scrape_meta_data
+      unless @data.meta
+        @data.meta!.name!
+        @data.meta!.property!
+        parsed.xpath("//meta").each do |element|
+          get_meta_name_or_property(element)
+        end
+      end
+    end
+
+    # Store meta tag value, looking at meta name or meta property
+    def get_meta_name_or_property(element)
+      name_or_property = element.attributes["name"] ? "name" : (element.attributes["property"] ? "property" : nil)
+      content_or_value = element.attributes["content"] ? "content" : (element.attributes["value"] ? "value" : nil)
+
+      if !name_or_property.nil? && !content_or_value.nil?
+        @data.meta.name[element.attributes[name_or_property].value.downcase] = element.attributes[content_or_value].value
+      end
+    end
+
+    # Look for the first <p> block with 120 characters or more
+    def secondary_description
+      first_long_paragraph = parsed.search('//p[string-length() >= 120]').first
+      first_long_paragraph ? first_long_paragraph.text : ''
+    end
+
+    def parsed_links
+      @parsed_links ||= cleanup_nokogiri_values(parsed.search("//a/@href"))
+    end
+
+    def parsed_images
+      @parsed_images ||= cleanup_nokogiri_values(parsed.search('//img/@src'))
+    end
+
+    def parsed_feed(format)
+      feed = parsed.search("//link[@type='application/#{format}+xml']").first
+      feed ? URL.absolutify(feed.attributes['href'].value, base_url) : nil
+    end
+
+    def charset_from_meta_charset
+      parsed.css("meta[charset]")[0].attributes['charset'].value rescue nil
+    end
+
+    def charset_from_meta_content_type
+      parsed.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(";")[1].split("=")[1] rescue nil
+    end
+
+    # Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
+    # or the url of the document if no <base> tag was found.
+    def base_url
+      base_href || url
+    end
+
+    # Returns the value of the href attribute on the <base /> tag, if it exists
+    def base_href
+      parsed.search('base').first.attributes['href'].value rescue nil
+    end
+
+    # Takes a nokogiri search result, strips the values, rejects the empty ones, and removes duplicates
+    def cleanup_nokogiri_values(results)
+      results.map { |a| a.value.strip }.reject { |s| s.empty? }.uniq
+    end
+  end
+end
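The method_missing hook above is what backs the dynamic page.meta_* readers mentioned in the README. Roughly, with made-up tag values for illustration:

    # Given markup such as:
    #   <meta name="description" content="A page about kittens">
    #   <meta property="og:image" content="http://example.com/kitten.jpg">

    page.meta_description   # looked up under @data.meta.name['description']
    page.meta_og_image      # "og_image" is rewritten to "og:image" and found under meta.property
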
data/lib/meta_inspector/request.rb
ADDED
@@ -0,0 +1,55 @@
+# -*- encoding: utf-8 -*-
+
+require 'open-uri'
+require 'open_uri_redirections'
+require 'timeout'
+
+module MetaInspector
+
+  # Makes the request to the server
+  class Request
+    include MetaInspector::Exceptionable
+
+    def initialize(initial_url, options = {})
+      options = defaults.merge(options)
+
+      @url                = initial_url
+      @allow_redirections = options[:allow_redirections]
+      @timeout            = options[:timeout]
+      @exception_log      = options[:exception_log]
+    end
+
+    extend Forwardable
+    def_delegators :@url, :url
+
+    def read
+      response.read if response
+    end
+
+    def content_type
+      response.content_type if response
+    end
+
+    private
+
+    def response
+      Timeout::timeout(@timeout) { @response ||= fetch }
+
+    rescue TimeoutError, SocketError => e
+      @exception_log << e
+      nil
+    end
+
+    def fetch
+      request = open(url, {:allow_redirections => @allow_redirections})
+
+      @url.url = request.base_uri.to_s
+
+      request
+    end
+
+    def defaults
+      { allow_redirections: false, timeout: 20, exception_log: MetaInspector::ExceptionLog.new }
+    end
+  end
+end
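Request is an internal collaborator; Document#initialize above shows how it is wired up, roughly like this (simplified, using only names that appear in the diff):

    exception_log = MetaInspector::ExceptionLog.new(warn_level: :warn)
    url           = MetaInspector::URL.new('http://example.com', exception_log: exception_log)
    request       = MetaInspector::Request.new(url, allow_redirections: :safe,
                                                    timeout:            20,
                                                    exception_log:      exception_log)

    request.content_type   # e.g. "text/html"
    request.read           # the response body; timeouts and socket errors end up in the exception log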