metainspector 1.16.1 → 1.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +17 -11
- data/lib/meta_inspector.rb +10 -3
- data/lib/meta_inspector/deprecations.rb +19 -0
- data/lib/meta_inspector/document.rb +81 -0
- data/lib/meta_inspector/exception_log.rb +29 -0
- data/lib/meta_inspector/exceptionable.rb +11 -0
- data/lib/meta_inspector/parser.rb +178 -0
- data/lib/meta_inspector/request.rb +55 -0
- data/lib/meta_inspector/url.rb +76 -0
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/document_spec.rb +97 -0
- data/spec/exception_log_spec.rb +59 -0
- data/spec/meta_inspector_spec.rb +9 -0
- data/spec/parser_spec.rb +374 -0
- data/spec/redirections_spec.rb +20 -3
- data/spec/request_spec.rb +64 -0
- data/spec/url_spec.rb +74 -0
- metadata +18 -7
- data/lib/meta_inspector/scraper.rb +0 -283
- data/spec/metainspector_spec.rb +0 -547
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 54f34fbd4dec77ffa68eb9762cdc140e98246817
|
4
|
+
data.tar.gz: 0c294be322b646fa90150c3934218db529af3c6b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 79a235d8161922f9991f9df68a524f18076562c56ff68cee29a4ac8dc88c6abd1b1113363aaa9d79b39e9f703abcdcc801ccd24930c3a085157979814498c132
|
7
|
+
data.tar.gz: 68b165770c8c8de5d56b923c9b25405f500ac3ee00bfb245af41a91dcf9d2ac7fcb1bf6fcbf6d6ec80b9f7453462d2c2235a14109a0a09710ec424d5f4b59a08
|
data/README.md
CHANGED
@@ -74,11 +74,11 @@ You can also access most of the scraped data as a hash:
|
|
74
74
|
|
75
75
|
The original document is accessible from:
|
76
76
|
|
77
|
-
page.
|
77
|
+
page.to_s # A String with the contents of the HTML document
|
78
78
|
|
79
79
|
And the full scraped document is accessible from:
|
80
80
|
|
81
|
-
page.
|
81
|
+
page.parsed # Nokogiri doc that you can use to get any element from the page
|
82
82
|
|
83
83
|
## Opengraph and Twitter card meta tags
|
84
84
|
|
@@ -91,8 +91,8 @@ Twitter cards & Open graph tags make it possible for you to attach media experie
|
|
91
91
|
|
92
92
|
Also, many sites use name & property, content & value attributes interchangeably. With MetaInspector, accessing this information is as easy as:
|
93
93
|
|
94
|
-
page.meta_og_image
|
95
|
-
page.meta_twitter_image_width
|
94
|
+
page.meta_og_image
|
95
|
+
page.meta_twitter_image_width
|
96
96
|
|
97
97
|
Note that MetaInspector gives priority to content over value. In other words if there is a tag of the form
|
98
98
|
|
@@ -122,7 +122,7 @@ However, you can tell MetaInspector to allow these redirections with the option
|
|
122
122
|
|
123
123
|
### HTML Content Only
|
124
124
|
|
125
|
-
MetaInspector will try to parse all URLs by default. If you want to raise an
|
125
|
+
MetaInspector will try to parse all URLs by default. If you want to raise an exception when trying to parse a non-html URL (one that has a content-type different than text/html), you can state it like this:
|
126
126
|
|
127
127
|
page = MetaInspector.new('markupvalidator.com', :html_content_only => true)
|
128
128
|
|
@@ -137,21 +137,27 @@ This is useful when using MetaInspector on web spidering. Although on the initia
|
|
137
137
|
page.title # returns nil
|
138
138
|
page.content_type # "image/png"
|
139
139
|
page.ok? # false
|
140
|
-
page.
|
140
|
+
page.exceptions.first.message # "The url provided contains image/png content instead of text/html content"
|
141
141
|
|
142
|
-
##
|
142
|
+
## Exception handling
|
143
143
|
|
144
144
|
You can check if the page has been successfully parsed with:
|
145
145
|
|
146
146
|
page.ok? # Will return true if everything looks OK
|
147
147
|
|
148
|
-
In case there have been any
|
148
|
+
In case there have been any exceptions, you can check them with:
|
149
149
|
|
150
|
-
page.
|
150
|
+
page.exceptions # Will return an array with the exceptions
|
151
151
|
|
152
|
-
|
152
|
+
You can also specify what to do when encountering an exception. By default it
|
153
|
+
will store it, but you can also tell MetaInspector to warn about it on the log
|
154
|
+
console, or to raise the exceptions, like this:
|
153
155
|
|
154
|
-
|
156
|
+
# This will warn about the exception on console
|
157
|
+
page = MetaInspector.new('http://example.com', warn_level: :warn)
|
158
|
+
|
159
|
+
# This will raise the exception
|
160
|
+
page = MetaInspector.new('http://example.com', warn_level: :raise)
|
155
161
|
|
156
162
|
## Examples
|
157
163
|
|
data/lib/meta_inspector.rb
CHANGED
@@ -1,12 +1,19 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
|
3
|
-
require
|
3
|
+
require 'forwardable'
|
4
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/exceptionable'))
|
5
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/exception_log'))
|
6
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/request'))
|
7
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/url'))
|
8
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parser'))
|
9
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/document'))
|
10
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/deprecations'))
|
4
11
|
|
5
12
|
module MetaInspector
|
6
13
|
extend self
|
7
14
|
|
8
|
-
# Sugar method to be able to
|
15
|
+
# Sugar method to be able to scrape a document in a shorter way
|
9
16
|
def new(url, options = {})
|
10
|
-
|
17
|
+
Document.new(url, options)
|
11
18
|
end
|
12
19
|
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
module MetaInspector
|
4
|
+
class Scraper < Document
|
5
|
+
def initialize
|
6
|
+
warn "The Scraper class is now deprecated since version 1.17, use Document instead"
|
7
|
+
super
|
8
|
+
end
|
9
|
+
|
10
|
+
def errors
|
11
|
+
warn "The #errors method is deprecated since version 1.17, use #exceptions instead"
|
12
|
+
exceptions
|
13
|
+
end
|
14
|
+
|
15
|
+
def document
|
16
|
+
warn "The #document method is deprecated since version 1.17, use #to_s instead"
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
module MetaInspector
|
4
|
+
# A MetaInspector::Document knows about its URL and its contents
|
5
|
+
class Document
|
6
|
+
attr_reader :timeout, :html_content_only, :allow_redirections, :warn_level
|
7
|
+
|
8
|
+
include MetaInspector::Exceptionable
|
9
|
+
|
10
|
+
# Initializes a new instance of MetaInspector::Document, setting the URL to the one given
|
11
|
+
# Options:
|
12
|
+
# => timeout: defaults to 20 seconds
|
13
|
+
# => html_content_only: if an exception should be raised if request content-type is not text/html. Defaults to false
|
14
|
+
# => allow_redirections: when :safe, allows HTTP => HTTPS redirections. When :all, it also allows HTTPS => HTTP
|
15
|
+
# => document: the html of the url as a string
|
16
|
+
# => warn_level: what to do when encountering exceptions. Can be :warn, :raise or nil
|
17
|
+
def initialize(initial_url, options = {})
|
18
|
+
options = defaults.merge(options)
|
19
|
+
@timeout = options[:timeout]
|
20
|
+
@html_content_only = options[:html_content_only]
|
21
|
+
@allow_redirections = options[:allow_redirections]
|
22
|
+
@document = options[:document]
|
23
|
+
|
24
|
+
if options[:verbose] == true
|
25
|
+
warn "The verbose option is deprecated since 1.17, please use warn_level: :warn instead"
|
26
|
+
options[:warn_level] = :warn
|
27
|
+
end
|
28
|
+
|
29
|
+
@warn_level = options[:warn_level]
|
30
|
+
|
31
|
+
@exception_log = MetaInspector::ExceptionLog.new(warn_level: warn_level)
|
32
|
+
@url = MetaInspector::URL.new(initial_url, exception_log: @exception_log)
|
33
|
+
@request = MetaInspector::Request.new(@url, allow_redirections: @allow_redirections,
|
34
|
+
timeout: @timeout,
|
35
|
+
exception_log: @exception_log)
|
36
|
+
@parser = MetaInspector::Parser.new(self, exception_log: @exception_log)
|
37
|
+
end
|
38
|
+
|
39
|
+
extend Forwardable
|
40
|
+
def_delegators :@url, :url, :scheme, :host, :root_url
|
41
|
+
def_delegators :@request, :content_type
|
42
|
+
def_delegators :@parser, :parsed, :method_missing, :title, :description, :links, :internal_links, :external_links,
|
43
|
+
:images, :image, :feed, :charset
|
44
|
+
|
45
|
+
# Returns all document data as a nested Hash
|
46
|
+
def to_hash
|
47
|
+
{
|
48
|
+
'url' => url,
|
49
|
+
'title' => title,
|
50
|
+
'links' => links,
|
51
|
+
'internal_links' => internal_links,
|
52
|
+
'external_links' => external_links,
|
53
|
+
'images' => images,
|
54
|
+
'charset' => charset,
|
55
|
+
'feed' => feed,
|
56
|
+
'content_type' => content_type
|
57
|
+
}.merge @parser.to_hash
|
58
|
+
end
|
59
|
+
|
60
|
+
# Returns the contents of the document as a string
|
61
|
+
def to_s
|
62
|
+
document
|
63
|
+
end
|
64
|
+
|
65
|
+
private
|
66
|
+
|
67
|
+
def defaults
|
68
|
+
{ :timeout => 20, :html_content_only => false }
|
69
|
+
end
|
70
|
+
|
71
|
+
def document
|
72
|
+
@document ||= if html_content_only && content_type != "text/html"
|
73
|
+
raise "The url provided contains #{content_type} content instead of text/html content" and nil
|
74
|
+
else
|
75
|
+
@request.read
|
76
|
+
end
|
77
|
+
rescue Exception => e
|
78
|
+
@exception_log << e
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
module MetaInspector
|
4
|
+
|
5
|
+
# Stores the exceptions passed to it, warning about them if required
|
6
|
+
class ExceptionLog
|
7
|
+
attr_reader :exceptions, :warn_level
|
8
|
+
|
9
|
+
def initialize(options = {})
|
10
|
+
@exceptions = []
|
11
|
+
@warn_level = options[:warn_level]
|
12
|
+
end
|
13
|
+
|
14
|
+
def <<(exception)
|
15
|
+
case warn_level
|
16
|
+
when :warn
|
17
|
+
warn exception
|
18
|
+
when :raise
|
19
|
+
raise exception
|
20
|
+
end
|
21
|
+
|
22
|
+
@exceptions << exception
|
23
|
+
end
|
24
|
+
|
25
|
+
def ok?
|
26
|
+
exceptions.empty?
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,178 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'hashie/rash'
|
5
|
+
|
6
|
+
module MetaInspector
|
7
|
+
# Parses the document with Nokogiri
|
8
|
+
class Parser
|
9
|
+
include MetaInspector::Exceptionable
|
10
|
+
|
11
|
+
def initialize(document, options = {})
|
12
|
+
options = defaults.merge(options)
|
13
|
+
|
14
|
+
@document = document
|
15
|
+
@data = Hashie::Rash.new
|
16
|
+
@exception_log = options[:exception_log]
|
17
|
+
end
|
18
|
+
|
19
|
+
extend Forwardable
|
20
|
+
def_delegators :@document, :url, :scheme, :host
|
21
|
+
|
22
|
+
# Returns the whole parsed document
|
23
|
+
def parsed
|
24
|
+
@parsed ||= Nokogiri::HTML(@document.to_s)
|
25
|
+
|
26
|
+
rescue Exception => e
|
27
|
+
@exception_log << e
|
28
|
+
end
|
29
|
+
|
30
|
+
def to_hash
|
31
|
+
scrape_meta_data
|
32
|
+
@data.to_hash
|
33
|
+
end
|
34
|
+
|
35
|
+
# Returns the parsed document title, from the content of the <title> tag.
|
36
|
+
# This is not the same as the meta_title tag
|
37
|
+
def title
|
38
|
+
@title ||= parsed.css('title').inner_text rescue nil
|
39
|
+
end
|
40
|
+
|
41
|
+
# A description getter that first checks for a meta description and if not present will
|
42
|
+
# guess by looking at the first paragraph with 120 characters or more
|
43
|
+
def description
|
44
|
+
meta_description || secondary_description
|
45
|
+
end
|
46
|
+
|
47
|
+
# Links found on the page, as absolute URLs
|
48
|
+
def links
|
49
|
+
@links ||= parsed_links.map{ |l| URL.absolutify(URL.unrelativize(l, scheme), base_url) }.compact.uniq
|
50
|
+
end
|
51
|
+
|
52
|
+
# Internal links found on the page, as absolute URLs
|
53
|
+
def internal_links
|
54
|
+
@internal_links ||= links.select {|link| URL.new(link).host == host }
|
55
|
+
end
|
56
|
+
|
57
|
+
# External links found on the page, as absolute URLs
|
58
|
+
def external_links
|
59
|
+
@external_links ||= links.select {|link| URL.new(link).host != host }
|
60
|
+
end
|
61
|
+
|
62
|
+
# Images found on the page, as absolute URLs
|
63
|
+
def images
|
64
|
+
@images ||= parsed_images.map{ |i| URL.absolutify(i, base_url) }
|
65
|
+
end
|
66
|
+
|
67
|
+
# Returns the parsed image from Facebook's open graph property tags, or from the Twitter card image tag if there is none
|
68
|
+
# Most major websites now define this property, and it is usually very relevant
|
69
|
+
# See doc at http://developers.facebook.com/docs/opengraph/
|
70
|
+
def image
|
71
|
+
meta_og_image || meta_twitter_image
|
72
|
+
end
|
73
|
+
|
74
|
+
# Returns the parsed document's RSS feed link, or its Atom feed link if no RSS link is found
|
75
|
+
def feed
|
76
|
+
@feed ||= (parsed_feed('rss') || parsed_feed('atom'))
|
77
|
+
end
|
78
|
+
|
79
|
+
# Returns the charset from the meta tags, looking for it in the following order:
|
80
|
+
# <meta charset='utf-8' />
|
81
|
+
# <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
|
82
|
+
def charset
|
83
|
+
@charset ||= (charset_from_meta_charset || charset_from_meta_content_type)
|
84
|
+
end
|
85
|
+
|
86
|
+
private
|
87
|
+
|
88
|
+
def defaults
|
89
|
+
{ exception_log: MetaInspector::ExceptionLog.new }
|
90
|
+
end
|
91
|
+
|
92
|
+
# Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
|
93
|
+
# meta name: keywords, description, robots, generator
|
94
|
+
# meta http-equiv: content-language, Content-Type
|
95
|
+
#
|
96
|
+
# It will first try with meta name="..." and if nothing found,
|
97
|
+
# with meta http-equiv="...", substituting "_" by "-"
|
98
|
+
# TODO: define respond_to? to return true on the meta_name methods
|
99
|
+
def method_missing(method_name)
|
100
|
+
if method_name.to_s =~ /^meta_(.*)/
|
101
|
+
key = $1
|
102
|
+
|
103
|
+
#special treatment for opengraph (og:) and twitter card (twitter:) tags
|
104
|
+
key.gsub!("_",":") if key =~ /^og_(.*)/ || key =~ /^twitter_(.*)/
|
105
|
+
|
106
|
+
scrape_meta_data
|
107
|
+
|
108
|
+
@data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
|
109
|
+
else
|
110
|
+
super
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
# Scrapes all meta tags found
|
115
|
+
def scrape_meta_data
|
116
|
+
unless @data.meta
|
117
|
+
@data.meta!.name!
|
118
|
+
@data.meta!.property!
|
119
|
+
parsed.xpath("//meta").each do |element|
|
120
|
+
get_meta_name_or_property(element)
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
# Store meta tag value, looking at meta name or meta property
|
126
|
+
def get_meta_name_or_property(element)
|
127
|
+
name_or_property = element.attributes["name"] ? "name" : (element.attributes["property"] ? "property" : nil)
|
128
|
+
content_or_value = element.attributes["content"] ? "content" : (element.attributes["value"] ? "value" : nil)
|
129
|
+
|
130
|
+
if !name_or_property.nil? && !content_or_value.nil?
|
131
|
+
@data.meta.name[element.attributes[name_or_property].value.downcase] = element.attributes[content_or_value].value
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
# Look for the first <p> block with 120 characters or more
|
136
|
+
def secondary_description
|
137
|
+
first_long_paragraph = parsed.search('//p[string-length() >= 120]').first
|
138
|
+
first_long_paragraph ? first_long_paragraph.text : ''
|
139
|
+
end
|
140
|
+
|
141
|
+
def parsed_links
|
142
|
+
@parsed_links ||= cleanup_nokogiri_values(parsed.search("//a/@href"))
|
143
|
+
end
|
144
|
+
|
145
|
+
def parsed_images
|
146
|
+
@parsed_images ||= cleanup_nokogiri_values(parsed.search('//img/@src'))
|
147
|
+
end
|
148
|
+
|
149
|
+
def parsed_feed(format)
|
150
|
+
feed = parsed.search("//link[@type='application/#{format}+xml']").first
|
151
|
+
feed ? URL.absolutify(feed.attributes['href'].value, base_url) : nil
|
152
|
+
end
|
153
|
+
|
154
|
+
def charset_from_meta_charset
|
155
|
+
parsed.css("meta[charset]")[0].attributes['charset'].value rescue nil
|
156
|
+
end
|
157
|
+
|
158
|
+
def charset_from_meta_content_type
|
159
|
+
parsed.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(";")[1].split("=")[1] rescue nil
|
160
|
+
end
|
161
|
+
|
162
|
+
# Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
|
163
|
+
# or the url of the document if no <base> tag was found.
|
164
|
+
def base_url
|
165
|
+
base_href || url
|
166
|
+
end
|
167
|
+
|
168
|
+
# Returns the value of the href attribute on the <base /> tag, if it exists
|
169
|
+
def base_href
|
170
|
+
parsed.search('base').first.attributes['href'].value rescue nil
|
171
|
+
end
|
172
|
+
|
173
|
+
# Takes a nokogiri search result, strips the values, rejects the empty ones, and removes duplicates
|
174
|
+
def cleanup_nokogiri_values(results)
|
175
|
+
results.map { |a| a.value.strip }.reject { |s| s.empty? }.uniq
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'open-uri'
|
4
|
+
require 'open_uri_redirections'
|
5
|
+
require 'timeout'
|
6
|
+
|
7
|
+
module MetaInspector
|
8
|
+
|
9
|
+
# Makes the request to the server
|
10
|
+
class Request
|
11
|
+
include MetaInspector::Exceptionable
|
12
|
+
|
13
|
+
def initialize(initial_url, options = {})
|
14
|
+
options = defaults.merge(options)
|
15
|
+
|
16
|
+
@url = initial_url
|
17
|
+
@allow_redirections = options[:allow_redirections]
|
18
|
+
@timeout = options[:timeout]
|
19
|
+
@exception_log = options[:exception_log]
|
20
|
+
end
|
21
|
+
|
22
|
+
extend Forwardable
|
23
|
+
def_delegators :@url, :url
|
24
|
+
|
25
|
+
def read
|
26
|
+
response.read if response
|
27
|
+
end
|
28
|
+
|
29
|
+
def content_type
|
30
|
+
response.content_type if response
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def response
|
36
|
+
Timeout::timeout(@timeout) { @response ||= fetch }
|
37
|
+
|
38
|
+
rescue TimeoutError, SocketError => e
|
39
|
+
@exception_log << e
|
40
|
+
nil
|
41
|
+
end
|
42
|
+
|
43
|
+
def fetch
|
44
|
+
request = open(url, {:allow_redirections => @allow_redirections})
|
45
|
+
|
46
|
+
@url.url = request.base_uri.to_s
|
47
|
+
|
48
|
+
request
|
49
|
+
end
|
50
|
+
|
51
|
+
def defaults
|
52
|
+
{ allow_redirections: false, timeout: 20, exception_log: MetaInspector::ExceptionLog.new }
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|