metainspector 3.3.0 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Guardfile +5 -0
- data/README.md +26 -8
- data/lib/meta_inspector/document.rb +4 -8
- data/lib/meta_inspector/exception_log.rb +0 -2
- data/lib/meta_inspector/exceptionable.rb +0 -2
- data/lib/meta_inspector/parser.rb +17 -162
- data/lib/meta_inspector/parsers/base.rb +30 -0
- data/lib/meta_inspector/parsers/images.rb +45 -0
- data/lib/meta_inspector/parsers/links.rb +69 -0
- data/lib/meta_inspector/parsers/meta_tags.rb +72 -0
- data/lib/meta_inspector/parsers/texts.rb +27 -0
- data/lib/meta_inspector/request.rb +0 -2
- data/lib/meta_inspector/url.rb +0 -2
- data/lib/meta_inspector/version.rb +1 -3
- data/lib/meta_inspector.rb +5 -2
- data/lib/metainspector.rb +0 -2
- data/meta_inspector.gemspec +2 -1
- data/spec/document_spec.rb +16 -26
- data/spec/exception_log_spec.rb +1 -3
- data/spec/fixtures/example.response +17 -0
- data/spec/meta_inspector/images_spec.rb +111 -0
- data/spec/meta_inspector/links_spec.rb +203 -0
- data/spec/{meta_inspector_spec.rb → meta_inspector/meta_inspector_spec.rb} +1 -3
- data/spec/meta_inspector/meta_tags_spec.rb +108 -0
- data/spec/meta_inspector/redirections_spec.rb +48 -0
- data/spec/meta_inspector/texts_spec.rb +22 -0
- data/spec/parser_spec.rb +7 -393
- data/spec/request_spec.rb +1 -3
- data/spec/spec_helper.rb +0 -2
- data/spec/url_spec.rb +1 -3
- metadata +44 -6
- data/spec/redirections_spec.rb +0 -47
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 84153e6fb0df5db8c6e71c3b918c5afb48da2ff0
|
4
|
+
data.tar.gz: c3c98fc6d9488202a114769d0cb38871ba9eb871
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 65ba2a93d70893615ccb20a416d5c461e5f93877cdf6a77688131e6f7ad597f48ad48188930267cef9088f17af25fb934cdb219d355187d88f91070bbee77e96
|
7
|
+
data.tar.gz: dbbf14dad512d0ca6bf2dedc1dba7b6c8f93fcd0ed5ce1e3b7a6c0efd3345612555d2825a57db6efc18e727b0f8239794b123adf822c946a306add942185deec
|
data/Guardfile
ADDED
data/README.md
CHANGED
@@ -8,9 +8,24 @@ You give it an URL, and it lets you easily get its title, links, images, charset
|
|
8
8
|
|
9
9
|
You can try MetaInspector live at this little demo: [https://metainspectordemo.herokuapp.com](https://metainspectordemo.herokuapp.com)
|
10
10
|
|
11
|
-
## Changes in
|
11
|
+
## Changes in 4.0
|
12
|
+
|
13
|
+
* The links API has changed: instead of `page.links`, `page.internal_links` and `page.external_links` we now have:
|
14
|
+
|
15
|
+
```ruby
|
16
|
+
page.links.raw # Returns all links found, unprocessed
|
17
|
+
page.links.all # Returns all links found, unrelativized and absolutified
|
18
|
+
page.links.http # Returns all HTTP links found
|
19
|
+
page.links.non_http # Returns all non-HTTP links found
|
20
|
+
page.links.internal # Returns all internal HTTP links found
|
21
|
+
page.links.external # Returns all external HTTP links found
|
22
|
+
```
|
23
|
+
|
24
|
+
* The images API has changed: instead of `page.image` we now have `page.images.best`, and instead of `page.favicon` we now have `page.images.favicon`.
|
12
25
|
|
13
|
-
|
26
|
+
* Now `page.images.best` (formerly `page.image`) will return the first image in `page.images` if no OG or Twitter image is found, instead of returning `nil`.
|
27
|
+
|
28
|
+
## Changes in 3.0
|
14
29
|
|
15
30
|
* The redirect API has been changed, now the `:allow_redirections` option will expect only a boolean, which by default is `true`. That is, no more specifying `:safe`, `:unsafe` or `:all`.
|
16
31
|
* We've dropped support for Ruby < 2.
|
@@ -63,18 +78,21 @@ You can see the scraped data like this:
|
|
63
78
|
page.host # Hostname of the page (like, sitevalidator.com, without the scheme)
|
64
79
|
page.root_url # Root url (scheme + host, like http://sitevalidator.com/)
|
65
80
|
page.title # title of the page, as string
|
66
|
-
page.links
|
67
|
-
page.
|
68
|
-
page.
|
81
|
+
page.links.raw # every link found, unprocessed
|
82
|
+
page.links.all # every link found on the page as an absolute URL
|
83
|
+
page.links.http # every HTTP link found
|
84
|
+
page.links.non_http # every non-HTTP link found
|
85
|
+
page.links.internal # every internal link found on the page as an absolute URL
|
86
|
+
page.links.external # every external link found on the page as an absolute URL
|
69
87
|
page.meta['keywords'] # meta keywords, as string
|
70
88
|
page.meta['description'] # meta description, as string
|
71
89
|
page.description # returns the meta description, or the first long paragraph if no meta description is found
|
72
|
-
page.
|
73
|
-
page.images
|
90
|
+
page.images # enumerable collection, with every img found on the page as an absolute URL
|
91
|
+
page.images.best # Most relevant image, if defined with the og:image or twitter:image metatags. Falls back to the first element of page.images
|
92
|
+
page.images.favicon # absolute URL to the favicon
|
74
93
|
page.feed # Get rss or atom links in meta data fields as array
|
75
94
|
page.charset # UTF-8
|
76
95
|
page.content_type # content-type returned by the server when the url was requested
|
77
|
-
page.favicon # absolute URL to the favicon
|
78
96
|
|
79
97
|
## Meta tags
|
80
98
|
|
@@ -1,5 +1,3 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
1
|
module MetaInspector
|
4
2
|
# A MetaInspector::Document knows about its URL and its contents
|
5
3
|
class Document
|
@@ -35,7 +33,7 @@ module MetaInspector
|
|
35
33
|
extend Forwardable
|
36
34
|
def_delegators :@url, :url, :scheme, :host, :root_url
|
37
35
|
def_delegators :@request, :content_type, :response
|
38
|
-
def_delegators :@parser, :parsed, :respond_to?, :title, :description, :links,
|
36
|
+
def_delegators :@parser, :parsed, :respond_to?, :title, :description, :links,
|
39
37
|
:images, :image, :feed, :charset, :meta_tags, :meta_tag, :meta, :favicon
|
40
38
|
|
41
39
|
# Returns all document data as a nested Hash
|
@@ -43,15 +41,13 @@ module MetaInspector
|
|
43
41
|
{
|
44
42
|
'url' => url,
|
45
43
|
'title' => title,
|
46
|
-
'links' => links,
|
47
|
-
'
|
48
|
-
'external_links' => external_links,
|
49
|
-
'images' => images,
|
44
|
+
'links' => links.to_hash,
|
45
|
+
'images' => images.to_a,
|
50
46
|
'charset' => charset,
|
51
47
|
'feed' => feed,
|
52
48
|
'content_type' => content_type,
|
53
49
|
'meta_tags' => meta_tags,
|
54
|
-
'favicon' => favicon,
|
50
|
+
'favicon' => images.favicon,
|
55
51
|
'response' => { 'status' => response.status,
|
56
52
|
'headers' => response.headers }
|
57
53
|
}
|
@@ -1,36 +1,30 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
1
|
require 'nokogiri'
|
4
2
|
|
5
3
|
module MetaInspector
|
6
|
-
|
4
|
+
##
|
5
|
+
# Parses the document with Nokogiri.
|
6
|
+
#
|
7
|
+
# Delegates the parsing of the different elements to specialized parsers,
|
8
|
+
# passing itself as a reference for coordination purposes
|
9
|
+
#
|
7
10
|
class Parser
|
8
11
|
include MetaInspector::Exceptionable
|
9
12
|
|
10
13
|
def initialize(document, options = {})
|
11
|
-
@document
|
12
|
-
@exception_log
|
14
|
+
@document = document
|
15
|
+
@exception_log = options[:exception_log]
|
16
|
+
@meta_tag_parser = MetaInspector::Parsers::MetaTagsParser.new(self)
|
17
|
+
@links_parser = MetaInspector::Parsers::LinksParser.new(self)
|
18
|
+
@images_parser = MetaInspector::Parsers::ImagesParser.new(self)
|
19
|
+
@texts_parser = MetaInspector::Parsers::TextsParser.new(self)
|
13
20
|
end
|
14
21
|
|
15
22
|
extend Forwardable
|
16
|
-
def_delegators :@document,
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
'http-equiv' => meta_tags_by('http-equiv'),
|
22
|
-
'property' => meta_tags_by('property'),
|
23
|
-
'charset' => [charset_from_meta_charset]
|
24
|
-
}
|
25
|
-
end
|
26
|
-
|
27
|
-
def meta_tag
|
28
|
-
convert_each_array_to_first_element_on meta_tags
|
29
|
-
end
|
30
|
-
|
31
|
-
def meta
|
32
|
-
meta_tag['name'].merge(meta_tag['http-equiv']).merge(meta_tag['property']).merge({'charset' => meta_tag['charset']})
|
33
|
-
end
|
23
|
+
def_delegators :@document, :url, :scheme, :host
|
24
|
+
def_delegators :@meta_tag_parser, :meta_tags, :meta_tag, :meta, :charset
|
25
|
+
def_delegators :@links_parser, :links, :feed, :base_url
|
26
|
+
def_delegators :@images_parser, :images, :image, :favicon
|
27
|
+
def_delegators :@texts_parser, :title, :description
|
34
28
|
|
35
29
|
# Returns the whole parsed document
|
36
30
|
def parsed
|
@@ -38,144 +32,5 @@ module MetaInspector
|
|
38
32
|
rescue Exception => e
|
39
33
|
@exception_log << e
|
40
34
|
end
|
41
|
-
|
42
|
-
# Returns the parsed document title, from the content of the <title> tag
|
43
|
-
# within the <head> section.
|
44
|
-
# This is not the same as the meta_title tag
|
45
|
-
def title
|
46
|
-
@title ||= parsed.css('head title').inner_text rescue nil
|
47
|
-
end
|
48
|
-
|
49
|
-
# Return favicon url if exist
|
50
|
-
def favicon
|
51
|
-
query = '//link[@rel="icon" or contains(@rel, "shortcut")]'
|
52
|
-
value = parsed.xpath(query)[0].attributes['href'].value
|
53
|
-
@favicon ||= URL.absolutify(value, base_url)
|
54
|
-
rescue
|
55
|
-
nil
|
56
|
-
end
|
57
|
-
|
58
|
-
# A description getter that first checks for a meta description and if not present will
|
59
|
-
# guess by looking at the first paragraph with more than 120 characters
|
60
|
-
def description
|
61
|
-
meta['description'] || secondary_description
|
62
|
-
end
|
63
|
-
|
64
|
-
# Links found on the page, as absolute URLs
|
65
|
-
def links
|
66
|
-
@links ||= parsed_links.map{ |l| URL.absolutify(URL.unrelativize(l, scheme), base_url) }.compact.uniq
|
67
|
-
end
|
68
|
-
|
69
|
-
# Internal links found on the page, as absolute URLs
|
70
|
-
def internal_links
|
71
|
-
@internal_links ||= links.select {|link| URL.new(link).host == host }
|
72
|
-
end
|
73
|
-
|
74
|
-
# External links found on the page, as absolute URLs
|
75
|
-
def external_links
|
76
|
-
@external_links ||= links.select {|link| URL.new(link).host != host }
|
77
|
-
end
|
78
|
-
|
79
|
-
# Images found on the page, as absolute URLs
|
80
|
-
def images
|
81
|
-
@images ||= parsed_images.map{ |i| URL.absolutify(i, base_url) }
|
82
|
-
end
|
83
|
-
|
84
|
-
# Returns the parsed image from Facebook's open graph property tags
|
85
|
-
# Most all major websites now define this property and is usually very relevant
|
86
|
-
# See doc at http://developers.facebook.com/docs/opengraph/
|
87
|
-
# If none found, tries with Twitter image
|
88
|
-
# TODO: if not found, try with images.first
|
89
|
-
def image
|
90
|
-
meta['og:image'] || meta['twitter:image']
|
91
|
-
end
|
92
|
-
|
93
|
-
# Returns the parsed document meta rss link
|
94
|
-
def feed
|
95
|
-
@feed ||= (parsed_feed('rss') || parsed_feed('atom'))
|
96
|
-
end
|
97
|
-
|
98
|
-
# Returns the charset from the meta tags, looking for it in the following order:
|
99
|
-
# <meta charset='utf-8' />
|
100
|
-
# <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
|
101
|
-
def charset
|
102
|
-
@charset ||= (charset_from_meta_charset || charset_from_meta_content_type)
|
103
|
-
end
|
104
|
-
|
105
|
-
private
|
106
|
-
|
107
|
-
def meta_tags_by(attribute)
|
108
|
-
hash = {}
|
109
|
-
parsed.css("meta[@#{attribute}]").map do |tag|
|
110
|
-
name = tag.attributes[attribute].value.downcase rescue nil
|
111
|
-
content = tag.attributes['content'].value rescue nil
|
112
|
-
|
113
|
-
if name && content
|
114
|
-
hash[name] ||= []
|
115
|
-
hash[name] << content
|
116
|
-
end
|
117
|
-
end
|
118
|
-
hash
|
119
|
-
end
|
120
|
-
|
121
|
-
def convert_each_array_to_first_element_on(hash)
|
122
|
-
hash.each_pair do |k, v|
|
123
|
-
hash[k] = if v.is_a?(Hash)
|
124
|
-
convert_each_array_to_first_element_on(v)
|
125
|
-
elsif v.is_a?(Array)
|
126
|
-
v.first
|
127
|
-
else
|
128
|
-
v
|
129
|
-
end
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
|
-
# Look for the first <p> block with 120 characters or more
|
134
|
-
def secondary_description
|
135
|
-
first_long_paragraph = parsed_search('//p[string-length() >= 120]').first
|
136
|
-
first_long_paragraph ? first_long_paragraph.text : ''
|
137
|
-
end
|
138
|
-
|
139
|
-
def parsed_links
|
140
|
-
@parsed_links ||= cleanup_nokogiri_values(parsed_search("//a/@href"))
|
141
|
-
end
|
142
|
-
|
143
|
-
def parsed_images
|
144
|
-
@parsed_images ||= cleanup_nokogiri_values(parsed_search('//img/@src'))
|
145
|
-
end
|
146
|
-
|
147
|
-
def parsed_feed(format)
|
148
|
-
feed = parsed_search("//link[@type='application/#{format}+xml']").first
|
149
|
-
feed ? URL.absolutify(feed.attributes['href'].value, base_url) : nil
|
150
|
-
end
|
151
|
-
|
152
|
-
def charset_from_meta_charset
|
153
|
-
parsed.css("meta[charset]")[0].attributes['charset'].value rescue nil
|
154
|
-
end
|
155
|
-
|
156
|
-
def charset_from_meta_content_type
|
157
|
-
parsed.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(";")[1].split("=")[1] rescue nil
|
158
|
-
end
|
159
|
-
|
160
|
-
# Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
|
161
|
-
# or the url of the document if no <base> tag was found.
|
162
|
-
def base_url
|
163
|
-
base_href || url
|
164
|
-
end
|
165
|
-
|
166
|
-
# Returns the value of the href attribute on the <base /> tag, if it exists
|
167
|
-
def base_href
|
168
|
-
parsed_search('base').first.attributes['href'].value rescue nil
|
169
|
-
end
|
170
|
-
|
171
|
-
# Takes a nokogiri search result, strips the values, rejects the empty ones, and removes duplicates
|
172
|
-
def cleanup_nokogiri_values(results)
|
173
|
-
results.map { |a| a.value.strip }.reject { |s| s.empty? }.uniq
|
174
|
-
end
|
175
|
-
|
176
|
-
# Searches the parsed document for the selector, if the parsed document is searchable
|
177
|
-
def parsed_search(selector)
|
178
|
-
parsed.respond_to?(:search) ? parsed.search(selector) : []
|
179
|
-
end
|
180
35
|
end
|
181
36
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module MetaInspector
|
2
|
+
module Parsers
|
3
|
+
##
|
4
|
+
# Base class from which the specialized parsers inherit.
|
5
|
+
#
|
6
|
+
# On initialization a main parser is expected, so the specialized
|
7
|
+
# parsers can request the parsed document from the main parser, and
|
8
|
+
# then perform the searches on it.
|
9
|
+
#
|
10
|
+
# The main parser also serves as a message hub between the specialized
|
11
|
+
# parsers. For example, the ImagesParser needs to know the base_url
|
12
|
+
# in order to absolutify image URLs, so it delegates it to the main parser
|
13
|
+
# which, in turn, delegates it to the LinksParser.
|
14
|
+
#
|
15
|
+
class Base
|
16
|
+
def initialize(main_parser)
|
17
|
+
@main_parser = main_parser
|
18
|
+
end
|
19
|
+
|
20
|
+
extend Forwardable
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
# Cleans up nokogiri search results
|
25
|
+
def cleanup(results)
|
26
|
+
results.map { |_| _.value.strip }.reject { |_| _.empty? }.uniq
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module MetaInspector
|
2
|
+
module Parsers
|
3
|
+
class ImagesParser < Base
|
4
|
+
def_delegators :@main_parser, :parsed, :meta, :base_url
|
5
|
+
def_delegators :images_collection, :length, :size
|
6
|
+
|
7
|
+
include Enumerable
|
8
|
+
|
9
|
+
def images
|
10
|
+
self
|
11
|
+
end
|
12
|
+
|
13
|
+
def each(&block)
|
14
|
+
images_collection.each(&block)
|
15
|
+
end
|
16
|
+
|
17
|
+
# Returns the parsed image from Facebook's open graph property tags
|
18
|
+
# Most major websites now define this property, and it is usually very relevant
|
19
|
+
# See doc at http://developers.facebook.com/docs/opengraph/
|
20
|
+
# If none found, tries with Twitter image
|
21
|
+
def best
|
22
|
+
meta['og:image'] || meta['twitter:image'] || images_collection.first
|
23
|
+
end
|
24
|
+
|
25
|
+
# Return favicon url if exist
|
26
|
+
def favicon
|
27
|
+
query = '//link[@rel="icon" or contains(@rel, "shortcut")]'
|
28
|
+
value = parsed.xpath(query)[0].attributes['href'].value
|
29
|
+
@favicon ||= URL.absolutify(value, base_url)
|
30
|
+
rescue
|
31
|
+
nil
|
32
|
+
end
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
def images_collection
|
37
|
+
@images_collection ||= parsed_images.map{ |i| URL.absolutify(i, base_url) }
|
38
|
+
end
|
39
|
+
|
40
|
+
def parsed_images
|
41
|
+
@parsed_images ||= cleanup(parsed.search('//img/@src'))
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module MetaInspector
|
2
|
+
module Parsers
|
3
|
+
class LinksParser < Base
|
4
|
+
def_delegators :@main_parser, :parsed, :url, :scheme, :host
|
5
|
+
|
6
|
+
def links
|
7
|
+
self
|
8
|
+
end
|
9
|
+
|
10
|
+
# Returns all links found, unprocessed
|
11
|
+
def raw
|
12
|
+
@raw ||= cleanup(parsed.search("//a/@href")).compact.uniq
|
13
|
+
end
|
14
|
+
|
15
|
+
# Returns all links found, unrelativized and absolutified
|
16
|
+
def all
|
17
|
+
@all ||= raw.map { |l| URL.absolutify(URL.unrelativize(l, scheme), base_url) }
|
18
|
+
.compact.uniq
|
19
|
+
end
|
20
|
+
|
21
|
+
# Returns all HTTP links found
|
22
|
+
def http
|
23
|
+
@http ||= all.select {|l| l =~ /^http(s)?:\/\//i}
|
24
|
+
end
|
25
|
+
|
26
|
+
# Returns all non-HTTP links found
|
27
|
+
def non_http
|
28
|
+
@non_http ||= all.select {|l| l !~ /^http(s)?:\/\//i}
|
29
|
+
end
|
30
|
+
|
31
|
+
# Returns all internal HTTP links found
|
32
|
+
def internal
|
33
|
+
@internal ||= http.select {|link| URL.new(link).host == host }
|
34
|
+
end
|
35
|
+
|
36
|
+
# Returns all external HTTP links found
|
37
|
+
def external
|
38
|
+
@external ||= http.select {|link| URL.new(link).host != host }
|
39
|
+
end
|
40
|
+
|
41
|
+
def to_hash
|
42
|
+
{ 'internal' => internal, 'external' => external, 'non_http' => non_http }
|
43
|
+
end
|
44
|
+
|
45
|
+
# Returns the parsed document meta rss link
|
46
|
+
def feed
|
47
|
+
@feed ||= (parsed_feed('rss') || parsed_feed('atom'))
|
48
|
+
end
|
49
|
+
|
50
|
+
# Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
|
51
|
+
# or the url of the document if no <base> tag was found.
|
52
|
+
def base_url
|
53
|
+
base_href || url
|
54
|
+
end
|
55
|
+
|
56
|
+
private
|
57
|
+
|
58
|
+
def parsed_feed(format)
|
59
|
+
feed = parsed.search("//link[@type='application/#{format}+xml']").first
|
60
|
+
feed ? URL.absolutify(feed.attributes['href'].value, base_url) : nil
|
61
|
+
end
|
62
|
+
|
63
|
+
# Returns the value of the href attribute on the <base /> tag, if it exists
|
64
|
+
def base_href
|
65
|
+
parsed.search('base').first.attributes['href'].value rescue nil
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
module MetaInspector
|
2
|
+
module Parsers
|
3
|
+
class MetaTagsParser < Base
|
4
|
+
def_delegators :@main_parser, :parsed
|
5
|
+
|
6
|
+
def meta_tags
|
7
|
+
{
|
8
|
+
'name' => meta_tags_by('name'),
|
9
|
+
'http-equiv' => meta_tags_by('http-equiv'),
|
10
|
+
'property' => meta_tags_by('property'),
|
11
|
+
'charset' => [charset_from_meta_charset]
|
12
|
+
}
|
13
|
+
end
|
14
|
+
|
15
|
+
def meta_tag
|
16
|
+
convert_each_array_to_first_element_on meta_tags
|
17
|
+
end
|
18
|
+
|
19
|
+
def meta
|
20
|
+
meta_tag['name']
|
21
|
+
.merge(meta_tag['http-equiv'])
|
22
|
+
.merge(meta_tag['property'])
|
23
|
+
.merge({'charset' => meta_tag['charset']})
|
24
|
+
end
|
25
|
+
|
26
|
+
# Returns the charset from the meta tags, looking for it in the following order:
|
27
|
+
# <meta charset='utf-8' />
|
28
|
+
# <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
|
29
|
+
def charset
|
30
|
+
@charset ||= (charset_from_meta_charset || charset_from_meta_content_type)
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def charset_from_meta_charset
|
36
|
+
parsed.css("meta[charset]")[0].attributes['charset'].value rescue nil
|
37
|
+
end
|
38
|
+
|
39
|
+
def charset_from_meta_content_type
|
40
|
+
parsed.css("meta[http-equiv='Content-Type']")[0]
|
41
|
+
.attributes['content'].value.split(";")[1].split("=")[1] rescue nil
|
42
|
+
end
|
43
|
+
|
44
|
+
def meta_tags_by(attribute)
|
45
|
+
hash = {}
|
46
|
+
parsed.css("meta[@#{attribute}]").map do |tag|
|
47
|
+
name = tag.attributes[attribute].value.downcase rescue nil
|
48
|
+
content = tag.attributes['content'].value rescue nil
|
49
|
+
|
50
|
+
if name && content
|
51
|
+
hash[name] ||= []
|
52
|
+
hash[name] << content
|
53
|
+
end
|
54
|
+
end
|
55
|
+
hash
|
56
|
+
end
|
57
|
+
|
58
|
+
def convert_each_array_to_first_element_on(hash)
|
59
|
+
hash.each_pair do |k, v|
|
60
|
+
hash[k] = if v.is_a?(Hash)
|
61
|
+
convert_each_array_to_first_element_on(v)
|
62
|
+
elsif v.is_a?(Array)
|
63
|
+
v.first
|
64
|
+
else
|
65
|
+
v
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module MetaInspector
|
2
|
+
module Parsers
|
3
|
+
class TextsParser < Base
|
4
|
+
def_delegators :@main_parser, :parsed, :meta
|
5
|
+
|
6
|
+
# Returns the parsed document title, from the content of the <title> tag
|
7
|
+
# within the <head> section.
|
8
|
+
def title
|
9
|
+
@title ||= parsed.css('head title').inner_text rescue nil
|
10
|
+
end
|
11
|
+
|
12
|
+
# A description getter that first checks for a meta description and if not present will
|
13
|
+
# guess by looking at the first paragraph with 120 characters or more
|
14
|
+
def description
|
15
|
+
meta['description'] || secondary_description
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
# Look for the first <p> block with 120 characters or more
|
21
|
+
def secondary_description
|
22
|
+
first_long_paragraph = parsed.search('//p[string-length() >= 120]').first
|
23
|
+
first_long_paragraph ? first_long_paragraph.text : ''
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
data/lib/meta_inspector/url.rb
CHANGED
data/lib/meta_inspector.rb
CHANGED
@@ -1,11 +1,14 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
|
3
1
|
require 'forwardable'
|
4
2
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/exceptionable'))
|
5
3
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/exception_log'))
|
6
4
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/request'))
|
7
5
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/url'))
|
8
6
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parser'))
|
7
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parsers/base'))
|
8
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parsers/images'))
|
9
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parsers/links'))
|
10
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parsers/meta_tags'))
|
11
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/parsers/texts'))
|
9
12
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/document'))
|
10
13
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta_inspector/version'))
|
11
14
|
|
data/lib/metainspector.rb
CHANGED
data/meta_inspector.gemspec
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
1
|
require File.expand_path('../lib/meta_inspector/version', __FILE__)
|
3
2
|
|
4
3
|
Gem::Specification.new do |gem|
|
@@ -27,4 +26,6 @@ Gem::Specification.new do |gem|
|
|
27
26
|
gem.add_development_dependency 'awesome_print', '~> 1.2.0'
|
28
27
|
gem.add_development_dependency 'rake', '~> 10.1.0'
|
29
28
|
gem.add_development_dependency 'pry'
|
29
|
+
gem.add_development_dependency 'guard'
|
30
|
+
gem.add_development_dependency 'guard-rspec'
|
30
31
|
end
|