metainspector 3.3.0 → 4.0.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Guardfile +5 -0
- data/README.md +26 -8
- data/lib/meta_inspector/document.rb +4 -8
- data/lib/meta_inspector/exception_log.rb +0 -2
- data/lib/meta_inspector/exceptionable.rb +0 -2
- data/lib/meta_inspector/parser.rb +17 -162
- data/lib/meta_inspector/parsers/base.rb +30 -0
- data/lib/meta_inspector/parsers/images.rb +45 -0
- data/lib/meta_inspector/parsers/links.rb +69 -0
- data/lib/meta_inspector/parsers/meta_tags.rb +72 -0
- data/lib/meta_inspector/parsers/texts.rb +27 -0
- data/lib/meta_inspector/request.rb +0 -2
- data/lib/meta_inspector/url.rb +0 -2
- data/lib/meta_inspector/version.rb +1 -3
- data/lib/meta_inspector.rb +5 -2
- data/lib/metainspector.rb +0 -2
- data/meta_inspector.gemspec +2 -1
- data/spec/document_spec.rb +16 -26
- data/spec/exception_log_spec.rb +1 -3
- data/spec/fixtures/example.response +17 -0
- data/spec/meta_inspector/images_spec.rb +111 -0
- data/spec/meta_inspector/links_spec.rb +203 -0
- data/spec/{meta_inspector_spec.rb → meta_inspector/meta_inspector_spec.rb} +1 -3
- data/spec/meta_inspector/meta_tags_spec.rb +108 -0
- data/spec/meta_inspector/redirections_spec.rb +48 -0
- data/spec/meta_inspector/texts_spec.rb +22 -0
- data/spec/parser_spec.rb +7 -393
- data/spec/request_spec.rb +1 -3
- data/spec/spec_helper.rb +0 -2
- data/spec/url_spec.rb +1 -3
- metadata +44 -6
- data/spec/redirections_spec.rb +0 -47
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 84153e6fb0df5db8c6e71c3b918c5afb48da2ff0
|
4
|
+
data.tar.gz: c3c98fc6d9488202a114769d0cb38871ba9eb871
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 65ba2a93d70893615ccb20a416d5c461e5f93877cdf6a77688131e6f7ad597f48ad48188930267cef9088f17af25fb934cdb219d355187d88f91070bbee77e96
|
7
|
+
data.tar.gz: dbbf14dad512d0ca6bf2dedc1dba7b6c8f93fcd0ed5ce1e3b7a6c0efd3345612555d2825a57db6efc18e727b0f8239794b123adf822c946a306add942185deec
|
data/Guardfile
ADDED
data/README.md
CHANGED
@@ -8,9 +8,24 @@ You give it an URL, and it lets you easily get its title, links, images, charset
|
|
8
8
|
|
9
9
|
You can try MetaInspector live at this little demo: [https://metainspectordemo.herokuapp.com](https://metainspectordemo.herokuapp.com)
|
10
10
|
|
11
|
-
## Changes in
|
11
|
+
## Changes in 4.0
|
12
|
+
|
13
|
+
* The links API has changed. Instead of `page.links`, `page.internal_links` and `page.external_links`, we now have:
|
14
|
+
|
15
|
+
```ruby
|
16
|
+
page.links.raw # Returns all links found, unprocessed
|
17
|
+
page.links.all # Returns all links found, unrelativized and absolutified
|
18
|
+
page.links.http # Returns all HTTP links found
|
19
|
+
page.links.non_http # Returns all non-HTTP links found
|
20
|
+
page.links.internal # Returns all internal HTTP links found
|
21
|
+
page.links.external # Returns all external HTTP links found
|
22
|
+
```
|
23
|
+
|
24
|
+
* The images API has changed. Instead of `page.image` we now have `page.images.best`, and instead of `page.favicon` we now have `page.images.favicon`.
|
12
25
|
|
13
|
-
|
26
|
+
* `page.images.best` will now return the first image in `page.images` if no OG or Twitter image is found, instead of returning `nil`.
|
27
|
+
|
28
|
+
## Changes in 3.0
|
14
29
|
|
15
30
|
* The redirect API has been changed, now the `:allow_redirections` option will expect only a boolean, which by default is `true`. That is, no more specifying `:safe`, `:unsafe` or `:all`.
|
16
31
|
* We've dropped support for Ruby < 2.
|
@@ -63,18 +78,21 @@ You can see the scraped data like this:
|
|
63
78
|
page.host # Hostname of the page (like, sitevalidator.com, without the scheme)
|
64
79
|
page.root_url # Root url (scheme + host, like http://sitevalidator.com/)
|
65
80
|
page.title # title of the page, as string
|
66
|
-
page.links
|
67
|
-
page.
|
68
|
-
page.
|
81
|
+
page.links.raw # every link found, unprocessed
|
82
|
+
page.links.all # every link found on the page as an absolute URL
|
83
|
+
page.links.http # every HTTP link found
|
84
|
+
page.links.non_http # every non-HTTP link found
|
85
|
+
page.links.internal # every internal link found on the page as an absolute URL
|
86
|
+
page.links.external # every external link found on the page as an absolute URL
|
69
87
|
page.meta['keywords'] # meta keywords, as string
|
70
88
|
page.meta['description'] # meta description, as string
|
71
89
|
page.description # returns the meta description, or the first long paragraph if no meta description is found
|
72
|
-
page.
|
73
|
-
page.images
|
90
|
+
page.images # enumerable collection, with every img found on the page as an absolute URL
|
91
|
+
page.images.best # Most relevant image, if defined with the og:image or twitter:image metatags. Falls back to the first element of page.images
|
92
|
+
page.images.favicon # absolute URL to the favicon
|
74
93
|
page.feed # Get rss or atom links in meta data fields as array
|
75
94
|
page.charset # UTF-8
|
76
95
|
page.content_type # content-type returned by the server when the url was requested
|
77
|
-
page.favicon # absolute URL to the favicon
|
78
96
|
|
79
97
|
## Meta tags
|
80
98
|
|
@@ -1,5 +1,3 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
1
|
module MetaInspector
|
4
2
|
# A MetaInspector::Document knows about its URL and its contents
|
5
3
|
class Document
|
@@ -35,7 +33,7 @@ module MetaInspector
|
|
35
33
|
extend Forwardable
|
36
34
|
def_delegators :@url, :url, :scheme, :host, :root_url
|
37
35
|
def_delegators :@request, :content_type, :response
|
38
|
-
def_delegators :@parser, :parsed, :respond_to?, :title, :description, :links,
|
36
|
+
def_delegators :@parser, :parsed, :respond_to?, :title, :description, :links,
|
39
37
|
:images, :image, :feed, :charset, :meta_tags, :meta_tag, :meta, :favicon
|
40
38
|
|
41
39
|
# Returns all document data as a nested Hash
|
@@ -43,15 +41,13 @@ module MetaInspector
|
|
43
41
|
{
|
44
42
|
'url' => url,
|
45
43
|
'title' => title,
|
46
|
-
'links' => links,
|
47
|
-
'
|
48
|
-
'external_links' => external_links,
|
49
|
-
'images' => images,
|
44
|
+
'links' => links.to_hash,
|
45
|
+
'images' => images.to_a,
|
50
46
|
'charset' => charset,
|
51
47
|
'feed' => feed,
|
52
48
|
'content_type' => content_type,
|
53
49
|
'meta_tags' => meta_tags,
|
54
|
-
'favicon' => favicon,
|
50
|
+
'favicon' => images.favicon,
|
55
51
|
'response' => { 'status' => response.status,
|
56
52
|
'headers' => response.headers }
|
57
53
|
}
|
@@ -1,36 +1,30 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
1
|
require 'nokogiri'
|
4
2
|
|
5
3
|
module MetaInspector
|
6
|
-
|
4
|
+
##
|
5
|
+
# Parses the document with Nokogiri.
|
6
|
+
#
|
7
|
+
# Delegates the parsing of the different elements to specialized parsers,
|
8
|
+
# passing itself as a reference for coordination purposes
|
9
|
+
#
|
7
10
|
class Parser
|
8
11
|
include MetaInspector::Exceptionable
|
9
12
|
|
10
13
|
def initialize(document, options = {})
|
11
|
-
@document
|
12
|
-
@exception_log
|
14
|
+
@document = document
|
15
|
+
@exception_log = options[:exception_log]
|
16
|
+
@meta_tag_parser = MetaInspector::Parsers::MetaTagsParser.new(self)
|
17
|
+
@links_parser = MetaInspector::Parsers::LinksParser.new(self)
|
18
|
+
@images_parser = MetaInspector::Parsers::ImagesParser.new(self)
|
19
|
+
@texts_parser = MetaInspector::Parsers::TextsParser.new(self)
|
13
20
|
end
|
14
21
|
|
15
22
|
extend Forwardable
|
16
|
-
def_delegators :@document,
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
'http-equiv' => meta_tags_by('http-equiv'),
|
22
|
-
'property' => meta_tags_by('property'),
|
23
|
-
'charset' => [charset_from_meta_charset]
|
24
|
-
}
|
25
|
-
end
|
26
|
-
|
27
|
-
def meta_tag
|
28
|
-
convert_each_array_to_first_element_on meta_tags
|
29
|
-
end
|
30
|
-
|
31
|
-
def meta
|
32
|
-
meta_tag['name'].merge(meta_tag['http-equiv']).merge(meta_tag['property']).merge({'charset' => meta_tag['charset']})
|
33
|
-
end
|
23
|
+
def_delegators :@document, :url, :scheme, :host
|
24
|
+
def_delegators :@meta_tag_parser, :meta_tags, :meta_tag, :meta, :charset
|
25
|
+
def_delegators :@links_parser, :links, :feed, :base_url
|
26
|
+
def_delegators :@images_parser, :images, :image, :favicon
|
27
|
+
def_delegators :@texts_parser, :title, :description
|
34
28
|
|
35
29
|
# Returns the whole parsed document
|
36
30
|
def parsed
|
@@ -38,144 +32,5 @@ module MetaInspector
|
|
38
32
|
rescue Exception => e
|
39
33
|
@exception_log << e
|
40
34
|
end
|
41
|
-
|
42
|
-
# Returns the parsed document title, from the content of the <title> tag
|
43
|
-
# within the <head> section.
|
44
|
-
# This is not the same as the meta_title tag
|
45
|
-
def title
|
46
|
-
@title ||= parsed.css('head title').inner_text rescue nil
|
47
|
-
end
|
48
|
-
|
49
|
-
# Return favicon url if exist
|
50
|
-
def favicon
|
51
|
-
query = '//link[@rel="icon" or contains(@rel, "shortcut")]'
|
52
|
-
value = parsed.xpath(query)[0].attributes['href'].value
|
53
|
-
@favicon ||= URL.absolutify(value, base_url)
|
54
|
-
rescue
|
55
|
-
nil
|
56
|
-
end
|
57
|
-
|
58
|
-
# A description getter that first checks for a meta description and if not present will
|
59
|
-
# guess by looking at the first paragraph with more than 120 characters
|
60
|
-
def description
|
61
|
-
meta['description'] || secondary_description
|
62
|
-
end
|
63
|
-
|
64
|
-
# Links found on the page, as absolute URLs
|
65
|
-
def links
|
66
|
-
@links ||= parsed_links.map{ |l| URL.absolutify(URL.unrelativize(l, scheme), base_url) }.compact.uniq
|
67
|
-
end
|
68
|
-
|
69
|
-
# Internal links found on the page, as absolute URLs
|
70
|
-
def internal_links
|
71
|
-
@internal_links ||= links.select {|link| URL.new(link).host == host }
|
72
|
-
end
|
73
|
-
|
74
|
-
# External links found on the page, as absolute URLs
|
75
|
-
def external_links
|
76
|
-
@external_links ||= links.select {|link| URL.new(link).host != host }
|
77
|
-
end
|
78
|
-
|
79
|
-
# Images found on the page, as absolute URLs
|
80
|
-
def images
|
81
|
-
@images ||= parsed_images.map{ |i| URL.absolutify(i, base_url) }
|
82
|
-
end
|
83
|
-
|
84
|
-
# Returns the parsed image from Facebook's open graph property tags
|
85
|
-
# Most all major websites now define this property and is usually very relevant
|
86
|
-
# See doc at http://developers.facebook.com/docs/opengraph/
|
87
|
-
# If none found, tries with Twitter image
|
88
|
-
# TODO: if not found, try with images.first
|
89
|
-
def image
|
90
|
-
meta['og:image'] || meta['twitter:image']
|
91
|
-
end
|
92
|
-
|
93
|
-
# Returns the parsed document meta rss link
|
94
|
-
def feed
|
95
|
-
@feed ||= (parsed_feed('rss') || parsed_feed('atom'))
|
96
|
-
end
|
97
|
-
|
98
|
-
# Returns the charset from the meta tags, looking for it in the following order:
|
99
|
-
# <meta charset='utf-8' />
|
100
|
-
# <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
|
101
|
-
def charset
|
102
|
-
@charset ||= (charset_from_meta_charset || charset_from_meta_content_type)
|
103
|
-
end
|
104
|
-
|
105
|
-
private
|
106
|
-
|
107
|
-
def meta_tags_by(attribute)
|
108
|
-
hash = {}
|
109
|
-
parsed.css("meta[@#{attribute}]").map do |tag|
|
110
|
-
name = tag.attributes[attribute].value.downcase rescue nil
|
111
|
-
content = tag.attributes['content'].value rescue nil
|
112
|
-
|
113
|
-
if name && content
|
114
|
-
hash[name] ||= []
|
115
|
-
hash[name] << content
|
116
|
-
end
|
117
|
-
end
|
118
|
-
hash
|
119
|
-
end
|
120
|
-
|
121
|
-
def convert_each_array_to_first_element_on(hash)
|
122
|
-
hash.each_pair do |k, v|
|
123
|
-
hash[k] = if v.is_a?(Hash)
|
124
|
-
convert_each_array_to_first_element_on(v)
|
125
|
-
elsif v.is_a?(Array)
|
126
|
-
v.first
|
127
|
-
else
|
128
|
-
v
|
129
|
-
end
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
|
-
# Look for the first <p> block with 120 characters or more
|
134
|
-
def secondary_description
|
135
|
-
first_long_paragraph = parsed_search('//p[string-length() >= 120]').first
|
136
|
-
first_long_paragraph ? first_long_paragraph.text : ''
|
137
|
-
end
|
138
|
-
|
139
|
-
def parsed_links
|
140
|
-
@parsed_links ||= cleanup_nokogiri_values(parsed_search("//a/@href"))
|
141
|
-
end
|
142
|
-
|
143
|
-
def parsed_images
|
144
|
-
@parsed_images ||= cleanup_nokogiri_values(parsed_search('//img/@src'))
|
145
|
-
end
|
146
|
-
|
147
|
-
def parsed_feed(format)
|
148
|
-
feed = parsed_search("//link[@type='application/#{format}+xml']").first
|
149
|
-
feed ? URL.absolutify(feed.attributes['href'].value, base_url) : nil
|
150
|
-
end
|
151
|
-
|
152
|
-
def charset_from_meta_charset
|
153
|
-
parsed.css("meta[charset]")[0].attributes['charset'].value rescue nil
|
154
|
-
end
|
155
|
-
|
156
|
-
def charset_from_meta_content_type
|
157
|
-
parsed.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(";")[1].split("=")[1] rescue nil
|
158
|
-
end
|
159
|
-
|
160
|
-
# Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
|
161
|
-
# or the url of the document if no <base> tag was found.
|
162
|
-
def base_url
|
163
|
-
base_href || url
|
164
|
-
end
|
165
|
-
|
166
|
-
# Returns the value of the href attribute on the <base /> tag, if it exists
|
167
|
-
def base_href
|
168
|
-
parsed_search('base').first.attributes['href'].value rescue nil
|
169
|
-
end
|
170
|
-
|
171
|
-
# Takes a nokogiri search result, strips the values, rejects the empty ones, and removes duplicates
|
172
|
-
def cleanup_nokogiri_values(results)
|
173
|
-
results.map { |a| a.value.strip }.reject { |s| s.empty? }.uniq
|
174
|
-
end
|
175
|
-
|
176
|
-
# Searches the parsed document for the selector, if the parsed document is searchable
|
177
|
-
def parsed_search(selector)
|
178
|
-
parsed.respond_to?(:search) ? parsed.search(selector) : []
|
179
|
-
end
|
180
35
|
end
|
181
36
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module MetaInspector
|
2
|
+
module Parsers
|
3
|
+
##
|
4
|
+
# Base class from which the specialized parsers inherit.
|
5
|
+
#
|
6
|
+
# On initialization a main parser is expected, so the specialized
|
7
|
+
# parsers can request the parsed document to the main parser, and
|
8
|
+
# then perform the searches on it.
|
9
|
+
#
|
10
|
+
# The main parser also serves as a message hub between the specialized
|
11
|
+
# parsers. For example, the ImagesParser needs to know the base_url
|
12
|
+
# in order to absolutify image URLs, so it delegates it to the main parser
|
13
|
+
# which, in turn, delegates it to the LinksParser.
|
14
|
+
#
|
15
|
+
class Base
|
16
|
+
def initialize(main_parser)
|
17
|
+
@main_parser = main_parser
|
18
|
+
end
|
19
|
+
|
20
|
+
extend Forwardable
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
# Cleans up nokogiri search results
|
25
|
+
def cleanup(results)
|
26
|
+
results.map { |_| _.value.strip }.reject { |_| _.empty? }.uniq
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module MetaInspector
|
2
|
+
module Parsers
|
3
|
+
class ImagesParser < Base
|
4
|
+
def_delegators :@main_parser, :parsed, :meta, :base_url
|
5
|
+
def_delegators :images_collection, :length, :size
|
6
|
+
|
7
|
+
include Enumerable
|
8
|
+
|
9
|
+
def images
|
10
|
+
self
|
11
|
+
end
|
12
|
+
|
13
|
+
def each(&block)
|
14
|
+
images_collection.each(&block)
|
15
|
+
end
|
16
|
+
|
17
|
+
# Returns the parsed image from Facebook's open graph property tags
|
18
|
+
# Most all major websites now define this property and is usually very relevant
|
19
|
+
# See doc at http://developers.facebook.com/docs/opengraph/
|
20
|
+
# If none found, tries with Twitter image
|
21
|
+
def best
|
22
|
+
meta['og:image'] || meta['twitter:image'] || images_collection.first
|
23
|
+
end
|
24
|
+
|
25
|
+
# Return favicon url if exist
|
26
|
+
def favicon
|
27
|
+
query = '//link[@rel="icon" or contains(@rel, "shortcut")]'
|
28
|
+
value = parsed.xpath(query)[0].attributes['href'].value
|
29
|
+
@favicon ||= URL.absolutify(value, base_url)
|
30
|
+
rescue
|
31
|
+
nil
|
32
|
+
end
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
def images_collection
|
37
|
+
@images_collection ||= parsed_images.map{ |i| URL.absolutify(i, base_url) }
|
38
|
+
end
|
39
|
+
|
40
|
+
def parsed_images
|
41
|
+
@parsed_images ||= cleanup(parsed.search('//img/@src'))
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module MetaInspector
|
2
|
+
module Parsers
|
3
|
+
class LinksParser < Base
|
4
|
+
def_delegators :@main_parser, :parsed, :url, :scheme, :host
|
5
|
+
|
6
|
+
def links
|
7
|
+
self
|
8
|
+
end
|
9
|
+
|
10
|
+
# Returns all links found, unprocessed
|
11
|
+
def raw
|
12
|
+
@raw ||= cleanup(parsed.search("//a/@href")).compact.uniq
|
13
|
+
end
|
14
|
+
|
15
|
+
# Returns all links found, unrelativized and absolutified
|
16
|
+
def all
|
17
|
+
@all ||= raw.map { |l| URL.absolutify(URL.unrelativize(l, scheme), base_url) }
|
18
|
+
.compact.uniq
|
19
|
+
end
|
20
|
+
|
21
|
+
# Returns all HTTP links found
|
22
|
+
def http
|
23
|
+
@http ||= all.select {|l| l =~ /^http(s)?:\/\//i}
|
24
|
+
end
|
25
|
+
|
26
|
+
# Returns all non-HTTP links found
|
27
|
+
def non_http
|
28
|
+
@non_http ||= all.select {|l| l !~ /^http(s)?:\/\//i}
|
29
|
+
end
|
30
|
+
|
31
|
+
# Returns all internal HTTP links found
|
32
|
+
def internal
|
33
|
+
@internal ||= http.select {|link| URL.new(link).host == host }
|
34
|
+
end
|
35
|
+
|
36
|
+
# Returns all external HTTP links found
|
37
|
+
def external
|
38
|
+
@external ||= http.select {|link| URL.new(link).host != host }
|
39
|
+
end
|
40
|
+
|
41
|
+
def to_hash
|
42
|
+
{ 'internal' => internal, 'external' => external, 'non_http' => non_http }
|
43
|
+
end
|
44
|
+
|
45
|
+
# Returns the document's RSS feed link, or its Atom feed link if no RSS link is found
|
46
|
+
def feed
|
47
|
+
@feed ||= (parsed_feed('rss') || parsed_feed('atom'))
|
48
|
+
end
|
49
|
+
|
50
|
+
# Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
|
51
|
+
# or the url of the document if no <base> tag was found.
|
52
|
+
def base_url
|
53
|
+
base_href || url
|
54
|
+
end
|
55
|
+
|
56
|
+
private
|
57
|
+
|
58
|
+
def parsed_feed(format)
|
59
|
+
feed = parsed.search("//link[@type='application/#{format}+xml']").first
|
60
|
+
feed ? URL.absolutify(feed.attributes['href'].value, base_url) : nil
|
61
|
+
end
|
62
|
+
|
63
|
+
# Returns the value of the href attribute on the <base /> tag, if it exists
|
64
|
+
def base_href
|
65
|
+
parsed.search('base').first.attributes['href'].value rescue nil
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
module MetaInspector
|
2
|
+
module Parsers
|
3
|
+
class MetaTagsParser < Base
|
4
|
+
def_delegators :@main_parser, :parsed
|
5
|
+
|
6
|
+
def meta_tags
|
7
|
+
{
|
8
|
+
'name' => meta_tags_by('name'),
|
9
|
+
'http-equiv' => meta_tags_by('http-equiv'),
|
10
|
+
'property' => meta_tags_by('property'),
|
11
|
+
'charset' => [charset_from_meta_charset]
|
12
|
+
}
|
13
|
+
end
|
14
|
+
|
15
|
+
def meta_tag
|
16
|
+
convert_each_array_to_first_element_on meta_tags
|
17
|
+
end
|
18
|
+
|
19
|
+
def meta
|
20
|
+
meta_tag['name']
|
21
|
+
.merge(meta_tag['http-equiv'])
|
22
|
+
.merge(meta_tag['property'])
|
23
|
+
.merge({'charset' => meta_tag['charset']})
|
24
|
+
end
|
25
|
+
|
26
|
+
# Returns the charset from the meta tags, looking for it in the following order:
|
27
|
+
# <meta charset='utf-8' />
|
28
|
+
# <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
|
29
|
+
def charset
|
30
|
+
@charset ||= (charset_from_meta_charset || charset_from_meta_content_type)
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def charset_from_meta_charset
|
36
|
+
parsed.css("meta[charset]")[0].attributes['charset'].value rescue nil
|
37
|
+
end
|
38
|
+
|
39
|
+
def charset_from_meta_content_type
|
40
|
+
parsed.css("meta[http-equiv='Content-Type']")[0]
|
41
|
+
.attributes['content'].value.split(";")[1].split("=")[1] rescue nil
|
42
|
+
end
|
43
|
+
|
44
|
+
def meta_tags_by(attribute)
|
45
|
+
hash = {}
|
46
|
+
parsed.css("meta[@#{attribute}]").map do |tag|
|
47
|
+
name = tag.attributes[attribute].value.downcase rescue nil
|
48
|
+
content = tag.attributes['content'].value rescue nil
|
49
|
+
|
50
|
+
if name && content
|
51
|
+
hash[name] ||= []
|
52
|
+
hash[name] << content
|
53
|
+
end
|
54
|
+
end
|
55
|
+
hash
|
56
|
+
end
|
57
|
+
|
58
|
+
def convert_each_array_to_first_element_on(hash)
|
59
|
+
hash.each_pair do |k, v|
|
60
|
+
hash[k] = if v.is_a?(Hash)
|
61
|
+
convert_each_array_to_first_element_on(v)
|
62
|
+
elsif v.is_a?(Array)
|
63
|
+
v.first
|
64
|
+
else
|
65
|
+
v
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module MetaInspector
|
2
|
+
module Parsers
|
3
|
+
class TextsParser < Base
|
4
|
+
def_delegators :@main_parser, :parsed, :meta
|
5
|
+
|
6
|
+
# Returns the parsed document title, from the content of the <title> tag
|
7
|
+
# within the <head> section.
|
8
|
+
def title
|
9
|
+
@title ||= parsed.css('head title').inner_text rescue nil
|
10
|
+
end
|
11
|
+
|
12
|
+
# A description getter that first checks for a meta description and if not present will
|
13
|
+
# guess by looking at the first paragraph with 120 characters or more
|
14
|
+
def description
|
15
|
+
meta['description'] || secondary_description
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
# Look for the first <p> block with 120 characters or more
|
21
|
+
def secondary_description
|
22
|
+
first_long_paragraph = parsed.search('//p[string-length() >= 120]').first
|
23
|
+
first_long_paragraph ? first_long_paragraph.text : ''
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
data/lib/meta_inspector/url.rb
CHANGED
data/lib/meta_inspector.rb
CHANGED
@@ -1,11 +1,14 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
1
|
require 'forwardable'
|
4
2
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/exceptionable'))
|
5
3
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/exception_log'))
|
6
4
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/request'))
|
7
5
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/url'))
|
8
6
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parser'))
|
7
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parsers/base'))
|
8
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parsers/images'))
|
9
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parsers/links'))
|
10
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parsers/meta_tags'))
|
11
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parsers/texts'))
|
9
12
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/document'))
|
10
13
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/version'))
|
11
14
|
|
data/lib/metainspector.rb
CHANGED
data/meta_inspector.gemspec
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
1
|
require File.expand_path('../lib/meta_inspector/version', __FILE__)
|
3
2
|
|
4
3
|
Gem::Specification.new do |gem|
|
@@ -27,4 +26,6 @@ Gem::Specification.new do |gem|
|
|
27
26
|
gem.add_development_dependency 'awesome_print', '~> 1.2.0'
|
28
27
|
gem.add_development_dependency 'rake', '~> 10.1.0'
|
29
28
|
gem.add_development_dependency 'pry'
|
29
|
+
gem.add_development_dependency 'guard'
|
30
|
+
gem.add_development_dependency 'guard-rspec'
|
30
31
|
end
|