metainspector 1.5.0 → 1.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +8 -1
- data/lib/meta_inspector/scraper.rb +30 -13
- data/lib/meta_inspector/version.rb +1 -1
- data/lib/metainspector.rb +1 -1
- data/meta_inspector.gemspec +2 -1
- data/samples/basic_scraping.rb +11 -6
- data/samples/spider.rb +9 -8
- data/spec/metainspector_spec.rb +31 -3
- metadata +37 -7
data/README.rdoc
CHANGED
@@ -32,7 +32,9 @@ Then you can see the scraped data like this:
|
|
32
32
|
page.meta_description # meta description, as string
|
33
33
|
page.meta_keywords # meta keywords, as string
|
34
34
|
page.image # Most relevant image, if defined with og:image
|
35
|
-
page.feed
|
35
|
+
page.feed # Get rss or atom links in meta data fields as array
|
36
|
+
page.meta_og_title # opengraph title
|
37
|
+
page.meta_og_image # opengraph image
|
36
38
|
|
37
39
|
MetaInspector uses dynamic methods for meta_tag discovery, so all these will work, and will be converted to a search of a meta tag by the corresponding name, and return its content attribute
|
38
40
|
|
@@ -48,6 +50,10 @@ It will also work for the meta tags of the form <meta http-equiv="name" ... />,
|
|
48
50
|
|
49
51
|
Please notice that MetaInspector is case sensitive, so page.meta_Content_Type is not the same as page.meta_content_type
|
50
52
|
|
53
|
+
You can also access most of the scraped data as a hash:
|
54
|
+
|
55
|
+
page.to_hash # { "url"=>"http://pagerankalert.com", "title" => "PageRankAlert.com", ... }
|
56
|
+
|
51
57
|
The full scraped document is accessible from:
|
52
58
|
|
53
59
|
page.document # Nokogiri doc that you can use to get any element from the page
|
@@ -100,5 +106,6 @@ You're welcome to fork this project and send pull requests. I want to thank spec
|
|
100
106
|
* If keywords seem to be separated by blank spaces, replace them with commas
|
101
107
|
* Mocks
|
102
108
|
* Check content type, process only HTML pages, don't try to scrape TAR files like http://ftp.ruby-lang.org/pub/ruby/ruby-1.9.1-p129.tar.bz2 or video files like http://isabel.dit.upm.es/component/option,com_docman/task,doc_download/gid,831/Itemid,74/
|
109
|
+
* Autodiscover all available meta tags
|
103
110
|
|
104
111
|
Copyright (c) 2009-2011 Jaime Iniesta, released under the MIT license
|
@@ -5,39 +5,40 @@ require 'rubygems'
|
|
5
5
|
require 'nokogiri'
|
6
6
|
require 'charguess'
|
7
7
|
require 'iconv'
|
8
|
+
require 'hashie/rash'
|
8
9
|
|
9
10
|
# MetaInspector provides an easy way to scrape web pages and get its elements
|
10
11
|
module MetaInspector
|
11
12
|
class Scraper
|
12
13
|
attr_reader :url
|
13
|
-
|
14
14
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
15
15
|
# If no scheme given, set it to http:// by default
|
16
16
|
def initialize(url)
|
17
17
|
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
18
|
+
@data = Hashie::Rash.new('url' => @url)
|
18
19
|
end
|
19
20
|
|
20
21
|
# Returns the parsed document title, from the content of the <title> tag.
|
21
22
|
# This is not the same as the meta_title tag
|
22
23
|
def title
|
23
|
-
@title ||= parsed_document.css('title').inner_html rescue nil
|
24
|
+
@data.title ||= parsed_document.css('title').inner_html rescue nil
|
24
25
|
end
|
25
26
|
|
26
27
|
# Returns the parsed document links
|
27
28
|
def links
|
28
|
-
@links ||= remove_mailto(parsed_document.search("//a")
|
29
|
-
.map {|link| link.attributes["href"]
|
29
|
+
@data.links ||= remove_mailto(parsed_document.search("//a") \
|
30
|
+
.map {|link| link.attributes["href"] \
|
30
31
|
.to_s.strip}.uniq) rescue nil
|
31
32
|
end
|
32
33
|
|
33
34
|
# Returns the links converted to absolute urls
|
34
35
|
def absolute_links
|
35
|
-
@absolute_links ||= links.map { |l| absolutify_url(l) }
|
36
|
+
@data.absolute_links ||= links.map { |l| absolutify_url(l) }
|
36
37
|
end
|
37
38
|
|
38
39
|
# Returns the parsed document meta rss links
|
39
40
|
def feed
|
40
|
-
@feed ||= parsed_document.xpath("//link").select{ |link|
|
41
|
+
@data.feed ||= parsed_document.xpath("//link").select{ |link|
|
41
42
|
link.attributes["type"] && link.attributes["type"].value =~ /(atom|rss)/
|
42
43
|
}.map { |link|
|
43
44
|
absolutify_url(link.attributes["href"].value)
|
@@ -48,14 +49,21 @@ module MetaInspector
|
|
48
49
|
# Most major websites now define this property, and it is usually very relevant
|
49
50
|
# See doc at http://developers.facebook.com/docs/opengraph/
|
50
51
|
def image
|
51
|
-
|
52
|
+
meta_og_image
|
52
53
|
end
|
53
54
|
|
54
55
|
# Returns the charset
|
55
56
|
# TODO: We should trust the charset expressed on the Content-Type meta tag
|
56
57
|
# and only guess it if none given
|
57
58
|
def charset
|
58
|
-
@charset ||= CharGuess.guess(document).downcase
|
59
|
+
@data.charset ||= CharGuess.guess(document).downcase
|
60
|
+
end
|
61
|
+
|
62
|
+
# Returns all parsed data as a nested Hash
|
63
|
+
def to_hash
|
64
|
+
# TODO: find a better option to populate the data to the Hash
|
65
|
+
image;feed;links;charset;absolute_links;title;meta_keywords
|
66
|
+
@data.to_hash
|
59
67
|
end
|
60
68
|
|
61
69
|
# Returns the whole parsed document
|
@@ -85,14 +93,23 @@ module MetaInspector
|
|
85
93
|
#
|
86
94
|
# It will first try with meta name="..." and if nothing found,
|
87
95
|
# with meta http-equiv="...", substituting "_" by "-"
|
88
|
-
# TODO: this should be case unsensitive, so meta_robots gets the results from the HTML for robots, Robots, ROBOTS...
|
89
|
-
# TODO: cache results on instance variables, using ||=
|
90
96
|
# TODO: define respond_to? to return true on the meta_name methods
|
91
97
|
def method_missing(method_name)
|
92
98
|
if method_name.to_s =~ /^meta_(.*)/
|
93
|
-
|
94
|
-
|
95
|
-
|
99
|
+
key = $1
|
100
|
+
#special treatment for og:
|
101
|
+
if key =~ /^og_(.*)/
|
102
|
+
key = "og:#{$1}"
|
103
|
+
end
|
104
|
+
unless @data.meta
|
105
|
+
@data.meta!.name!
|
106
|
+
@data.meta!.property!
|
107
|
+
parsed_document.xpath("//meta").each { |element|
|
108
|
+
@data.meta.name[element.attributes["name"].value.downcase] = element.attributes["content"].value if element.attributes["name"]
|
109
|
+
@data.meta.property[element.attributes["property"].value.downcase] = element.attributes["content"].value if element.attributes["property"]
|
110
|
+
}
|
111
|
+
end
|
112
|
+
@data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
|
96
113
|
else
|
97
114
|
super
|
98
115
|
end
|
data/lib/metainspector.rb
CHANGED
data/meta_inspector.gemspec
CHANGED
@@ -21,8 +21,9 @@ Gem::Specification.new do |s|
|
|
21
21
|
|
22
22
|
s.add_dependency 'nokogiri', '1.4.4'
|
23
23
|
s.add_dependency 'charguess', '1.3.20110226181011'
|
24
|
+
s.add_dependency "rash", "~> 0.3.0"
|
24
25
|
|
25
26
|
s.add_development_dependency 'rspec', '~> 2.6.0'
|
26
27
|
s.add_development_dependency 'fakeweb', '~> 1.3.0'
|
27
|
-
|
28
|
+
s.add_development_dependency 'awesome_print', '~> 0.4.0'
|
28
29
|
end
|
data/samples/basic_scraping.rb
CHANGED
@@ -1,17 +1,22 @@
|
|
1
1
|
# Some basic MetaInspector samples
|
2
2
|
|
3
|
-
|
3
|
+
$: << File.join(File.dirname(__FILE__), "/../lib")
|
4
|
+
require 'meta_inspector'
|
5
|
+
require 'ap'
|
4
6
|
|
5
|
-
puts "Enter a valid http
|
6
|
-
|
7
|
-
page = MetaInspector.new(
|
7
|
+
puts "Enter a valid http url to scrape it"
|
8
|
+
url = gets.strip
|
9
|
+
page = MetaInspector.new(url)
|
8
10
|
puts "...please wait while scraping the page..."
|
9
11
|
|
10
|
-
puts "Scraping #{page.
|
12
|
+
puts "Scraping #{page.url} returned these results:"
|
11
13
|
puts "TITLE: #{page.title}"
|
12
14
|
puts "META DESCRIPTION: #{page.meta_description}"
|
13
15
|
puts "META KEYWORDS: #{page.meta_keywords}"
|
14
16
|
puts "#{page.links.size} links found..."
|
15
17
|
page.links.each do |link|
|
16
18
|
puts " ==> #{link}"
|
17
|
-
end
|
19
|
+
end
|
20
|
+
|
21
|
+
puts "to_hash..."
|
22
|
+
ap page.to_hash
|
data/samples/spider.rb
CHANGED
@@ -1,19 +1,20 @@
|
|
1
1
|
# A basic spider that will follow links in an infinite loop
|
2
|
-
|
2
|
+
$: << File.join(File.dirname(__FILE__), "/../lib")
|
3
|
+
require 'meta_inspector'
|
3
4
|
|
4
5
|
q = Queue.new
|
5
6
|
visited_links=[]
|
6
7
|
|
7
|
-
puts "Enter a valid http
|
8
|
-
|
8
|
+
puts "Enter a valid http url to spider it following external links"
|
9
|
+
url = gets.strip
|
9
10
|
|
10
|
-
page = MetaInspector.new(
|
11
|
-
q.push(
|
11
|
+
page = MetaInspector.new(url)
|
12
|
+
q.push(url)
|
12
13
|
|
13
14
|
while q.size > 0
|
14
|
-
visited_links <<
|
15
|
-
page = MetaInspector.new(
|
16
|
-
puts "Spidering #{page.
|
15
|
+
visited_links << url = q.pop
|
16
|
+
page = MetaInspector.new(url)
|
17
|
+
puts "Spidering #{page.url}"
|
17
18
|
|
18
19
|
puts "TITLE: #{page.title}"
|
19
20
|
puts "META DESCRIPTION: #{page.meta_description}"
|
data/spec/metainspector_spec.rb
CHANGED
@@ -43,6 +43,7 @@ describe MetaInspector do
|
|
43
43
|
it "should find an image" do
|
44
44
|
@m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
45
45
|
@m.image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
|
46
|
+
@m.meta_og_image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
|
46
47
|
end
|
47
48
|
|
48
49
|
it "should have a Nokogiri::HTML::Document as parsed_document" do
|
@@ -103,6 +104,10 @@ describe MetaInspector do
|
|
103
104
|
@m.meta_robots.should == 'all,follow'
|
104
105
|
end
|
105
106
|
|
107
|
+
it "should get the robots meta tag" do
|
108
|
+
@m.meta_RoBoTs.should == 'all,follow'
|
109
|
+
end
|
110
|
+
|
106
111
|
it "should get the description meta tag" do
|
107
112
|
@m.meta_description.should == 'Track your PageRank(TM) changes and receive alerts by email'
|
108
113
|
end
|
@@ -116,9 +121,8 @@ describe MetaInspector do
|
|
116
121
|
@m.meta_content_language.should == "en"
|
117
122
|
end
|
118
123
|
|
119
|
-
it "should get the
|
120
|
-
|
121
|
-
@m.meta_Content_Type.should == "text/html; charset=utf-8"
|
124
|
+
it "should get the Csrf_pAram meta tag" do
|
125
|
+
@m.meta_Csrf_pAram.should == "authenticity_token"
|
122
126
|
end
|
123
127
|
|
124
128
|
it "should get the generator meta tag" do
|
@@ -129,6 +133,17 @@ describe MetaInspector do
|
|
129
133
|
it "should return nil for nonfound meta_tags" do
|
130
134
|
@m.meta_lollypop.should == nil
|
131
135
|
end
|
136
|
+
|
137
|
+
it "should find a meta_og_title" do
|
138
|
+
@m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
139
|
+
@m.meta_og_title.should == "Apple Claims New iPhone Only Visible To Most Loyal Of Customers"
|
140
|
+
end
|
141
|
+
|
142
|
+
it "should not find a meta_og_something" do
|
143
|
+
@m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
144
|
+
@m.meta_og_something.should == nil
|
145
|
+
end
|
146
|
+
|
132
147
|
end
|
133
148
|
|
134
149
|
context 'Charset detection' do
|
@@ -146,4 +161,17 @@ describe MetaInspector do
|
|
146
161
|
@m.charset.should == "utf-8"
|
147
162
|
end
|
148
163
|
end
|
164
|
+
|
165
|
+
context 'to_hash' do
|
166
|
+
|
167
|
+
FakeWeb.register_uri(:get, "http://www.pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
168
|
+
|
169
|
+
it "should return a hash with all the values set" do
|
170
|
+
@m = MetaInspector.new('http://www.pagerankalert.com')
|
171
|
+
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://www.pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "links"=>["/", "/es?language=es", "/users/sign_up", "/users/sign_in", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "absolute_links"=>["http://www.pagerankalert.com/", "http://www.pagerankalert.com/es?language=es", "http://www.pagerankalert.com/users/sign_up", "http://www.pagerankalert.com/users/sign_in", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
|
172
|
+
end
|
173
|
+
|
174
|
+
end
|
175
|
+
|
176
|
+
|
149
177
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 1
|
7
|
-
-
|
7
|
+
- 6
|
8
8
|
- 0
|
9
|
-
version: 1.
|
9
|
+
version: 1.6.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jaime Iniesta
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-06-03 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -48,9 +48,24 @@ dependencies:
|
|
48
48
|
type: :runtime
|
49
49
|
version_requirements: *id002
|
50
50
|
- !ruby/object:Gem::Dependency
|
51
|
-
name:
|
51
|
+
name: rash
|
52
52
|
prerelease: false
|
53
53
|
requirement: &id003 !ruby/object:Gem::Requirement
|
54
|
+
none: false
|
55
|
+
requirements:
|
56
|
+
- - ~>
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
segments:
|
59
|
+
- 0
|
60
|
+
- 3
|
61
|
+
- 0
|
62
|
+
version: 0.3.0
|
63
|
+
type: :runtime
|
64
|
+
version_requirements: *id003
|
65
|
+
- !ruby/object:Gem::Dependency
|
66
|
+
name: rspec
|
67
|
+
prerelease: false
|
68
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
54
69
|
none: false
|
55
70
|
requirements:
|
56
71
|
- - ~>
|
@@ -61,11 +76,11 @@ dependencies:
|
|
61
76
|
- 0
|
62
77
|
version: 2.6.0
|
63
78
|
type: :development
|
64
|
-
version_requirements: *
|
79
|
+
version_requirements: *id004
|
65
80
|
- !ruby/object:Gem::Dependency
|
66
81
|
name: fakeweb
|
67
82
|
prerelease: false
|
68
|
-
requirement: &
|
83
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
69
84
|
none: false
|
70
85
|
requirements:
|
71
86
|
- - ~>
|
@@ -76,7 +91,22 @@ dependencies:
|
|
76
91
|
- 0
|
77
92
|
version: 1.3.0
|
78
93
|
type: :development
|
79
|
-
version_requirements: *
|
94
|
+
version_requirements: *id005
|
95
|
+
- !ruby/object:Gem::Dependency
|
96
|
+
name: awesome_print
|
97
|
+
prerelease: false
|
98
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
99
|
+
none: false
|
100
|
+
requirements:
|
101
|
+
- - ~>
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
segments:
|
104
|
+
- 0
|
105
|
+
- 4
|
106
|
+
- 0
|
107
|
+
version: 0.4.0
|
108
|
+
type: :development
|
109
|
+
version_requirements: *id006
|
80
110
|
description: MetaInspector lets you scrape a web page and get its title, charset, link and meta tags
|
81
111
|
email:
|
82
112
|
- jaimeiniesta@gmail.com
|