metainspector 1.5.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +8 -1
- data/lib/meta_inspector/scraper.rb +30 -13
- data/lib/meta_inspector/version.rb +1 -1
- data/lib/metainspector.rb +1 -1
- data/meta_inspector.gemspec +2 -1
- data/samples/basic_scraping.rb +11 -6
- data/samples/spider.rb +9 -8
- data/spec/metainspector_spec.rb +31 -3
- metadata +37 -7
data/README.rdoc
CHANGED
@@ -32,7 +32,9 @@ Then you can see the scraped data like this:
|
|
32
32
|
page.meta_description # meta description, as string
|
33
33
|
page.meta_keywords # meta keywords, as string
|
34
34
|
page.image # Most relevant image, if defined with og:image
|
35
|
-
page.feed
|
35
|
+
page.feed # Get rss or atom links in meta data fields as array
|
36
|
+
page.meta_og_title # opengraph title
|
37
|
+
page.meta_og_image # opengraph image
|
36
38
|
|
37
39
|
MetaInspector uses dynamic methods for meta_tag discovery, so all these will work, and will be converted to a search of a meta tag by the corresponding name, and return its content attribute
|
38
40
|
|
@@ -48,6 +50,10 @@ It will also work for the meta tags of the form <meta http-equiv="name" ... />,
|
|
48
50
|
|
49
51
|
Please notice that meta tag lookups are case insensitive (names are downcased before matching), so page.meta_Content_Type returns the same as page.meta_content_type
|
50
52
|
|
53
|
+
You can also access most of the scraped data as a hash:
|
54
|
+
|
55
|
+
page.to_hash # { "url"=>"http://pagerankalert.com", "title" => "PageRankAlert.com", ... }
|
56
|
+
|
51
57
|
The full scraped document is accessible from:
|
52
58
|
|
53
59
|
page.document # Nokogiri doc that you can use it to get any element from the page
|
@@ -100,5 +106,6 @@ You're welcome to fork this project and send pull requests. I want to thank spec
|
|
100
106
|
* If keywords seem to be separated by blank spaces, replace them with commas
|
101
107
|
* Mocks
|
102
108
|
* Check content type, process only HTML pages, don't try to scrape TAR files like http://ftp.ruby-lang.org/pub/ruby/ruby-1.9.1-p129.tar.bz2 or video files like http://isabel.dit.upm.es/component/option,com_docman/task,doc_download/gid,831/Itemid,74/
|
109
|
+
* Autodiscover all available meta tags
|
103
110
|
|
104
111
|
Copyright (c) 2009-2011 Jaime Iniesta, released under the MIT license
|
@@ -5,39 +5,40 @@ require 'rubygems'
|
|
5
5
|
require 'nokogiri'
|
6
6
|
require 'charguess'
|
7
7
|
require 'iconv'
|
8
|
+
require 'hashie/rash'
|
8
9
|
|
9
10
|
# MetaInspector provides an easy way to scrape web pages and get its elements
|
10
11
|
module MetaInspector
|
11
12
|
class Scraper
|
12
13
|
attr_reader :url
|
13
|
-
|
14
14
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
15
15
|
# If no scheme given, set it to http:// by default
|
16
16
|
def initialize(url)
|
17
17
|
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
18
|
+
@data = Hashie::Rash.new('url' => @url)
|
18
19
|
end
|
19
20
|
|
20
21
|
# Returns the parsed document title, from the content of the <title> tag.
|
21
22
|
# This is not the same as the meta_title tag
|
22
23
|
def title
|
23
|
-
@title ||= parsed_document.css('title').inner_html rescue nil
|
24
|
+
@data.title ||= parsed_document.css('title').inner_html rescue nil
|
24
25
|
end
|
25
26
|
|
26
27
|
# Returns the parsed document links
|
27
28
|
def links
|
28
|
-
@links ||= remove_mailto(parsed_document.search("//a")
|
29
|
-
.map {|link| link.attributes["href"]
|
29
|
+
@data.links ||= remove_mailto(parsed_document.search("//a") \
|
30
|
+
.map {|link| link.attributes["href"] \
|
30
31
|
.to_s.strip}.uniq) rescue nil
|
31
32
|
end
|
32
33
|
|
33
34
|
# Returns the links converted to absolute urls
|
34
35
|
def absolute_links
|
35
|
-
@absolute_links ||= links.map { |l| absolutify_url(l) }
|
36
|
+
@data.absolute_links ||= links.map { |l| absolutify_url(l) }
|
36
37
|
end
|
37
38
|
|
38
39
|
# Returns the parsed document meta rss links
|
39
40
|
def feed
|
40
|
-
@feed ||= parsed_document.xpath("//link").select{ |link|
|
41
|
+
@data.feed ||= parsed_document.xpath("//link").select{ |link|
|
41
42
|
link.attributes["type"] && link.attributes["type"].value =~ /(atom|rss)/
|
42
43
|
}.map { |link|
|
43
44
|
absolutify_url(link.attributes["href"].value)
|
@@ -48,14 +49,21 @@ module MetaInspector
|
|
48
49
|
# Most major websites now define this property, and it is usually very relevant
|
49
50
|
# See doc at http://developers.facebook.com/docs/opengraph/
|
50
51
|
def image
|
51
|
-
|
52
|
+
meta_og_image
|
52
53
|
end
|
53
54
|
|
54
55
|
# Returns the charset
|
55
56
|
# TODO: We should trust the charset expressed on the Content-Type meta tag
|
56
57
|
# and only guess it if none given
|
57
58
|
def charset
|
58
|
-
@charset ||= CharGuess.guess(document).downcase
|
59
|
+
@data.charset ||= CharGuess.guess(document).downcase
|
60
|
+
end
|
61
|
+
|
62
|
+
# Returns all parsed data as a nested Hash
|
63
|
+
def to_hash
|
64
|
+
# TODO: find a better option to populate the data to the Hash
|
65
|
+
image;feed;links;charset;absolute_links;title;meta_keywords
|
66
|
+
@data.to_hash
|
59
67
|
end
|
60
68
|
|
61
69
|
# Returns the whole parsed document
|
@@ -85,14 +93,23 @@ module MetaInspector
|
|
85
93
|
#
|
86
94
|
# It will first try with meta name="..." and if nothing found,
|
87
95
|
# with meta http-equiv="...", substituting "_" by "-"
|
88
|
-
# TODO: this should be case unsensitive, so meta_robots gets the results from the HTML for robots, Robots, ROBOTS...
|
89
|
-
# TODO: cache results on instance variables, using ||=
|
90
96
|
# TODO: define respond_to? to return true on the meta_name methods
|
91
97
|
def method_missing(method_name)
|
92
98
|
if method_name.to_s =~ /^meta_(.*)/
|
93
|
-
|
94
|
-
|
95
|
-
|
99
|
+
key = $1
|
100
|
+
#special treatment for og:
|
101
|
+
if key =~ /^og_(.*)/
|
102
|
+
key = "og:#{$1}"
|
103
|
+
end
|
104
|
+
unless @data.meta
|
105
|
+
@data.meta!.name!
|
106
|
+
@data.meta!.property!
|
107
|
+
parsed_document.xpath("//meta").each { |element|
|
108
|
+
@data.meta.name[element.attributes["name"].value.downcase] = element.attributes["content"].value if element.attributes["name"]
|
109
|
+
@data.meta.property[element.attributes["property"].value.downcase] = element.attributes["content"].value if element.attributes["property"]
|
110
|
+
}
|
111
|
+
end
|
112
|
+
@data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
|
96
113
|
else
|
97
114
|
super
|
98
115
|
end
|
data/lib/metainspector.rb
CHANGED
data/meta_inspector.gemspec
CHANGED
@@ -21,8 +21,9 @@ Gem::Specification.new do |s|
|
|
21
21
|
|
22
22
|
s.add_dependency 'nokogiri', '1.4.4'
|
23
23
|
s.add_dependency 'charguess', '1.3.20110226181011'
|
24
|
+
s.add_dependency "rash", "~> 0.3.0"
|
24
25
|
|
25
26
|
s.add_development_dependency 'rspec', '~> 2.6.0'
|
26
27
|
s.add_development_dependency 'fakeweb', '~> 1.3.0'
|
27
|
-
|
28
|
+
s.add_development_dependency 'awesome_print', '~> 0.4.0'
|
28
29
|
end
|
data/samples/basic_scraping.rb
CHANGED
@@ -1,17 +1,22 @@
|
|
1
1
|
# Some basic MetaInspector samples
|
2
2
|
|
3
|
-
|
3
|
+
$: << File.join(File.dirname(__FILE__), "/../lib")
|
4
|
+
require 'meta_inspector'
|
5
|
+
require 'ap'
|
4
6
|
|
5
|
-
puts "Enter a valid http
|
6
|
-
|
7
|
-
page = MetaInspector.new(
|
7
|
+
puts "Enter a valid http url to scrape it"
|
8
|
+
url = gets.strip
|
9
|
+
page = MetaInspector.new(url)
|
8
10
|
puts "...please wait while scraping the page..."
|
9
11
|
|
10
|
-
puts "Scraping #{page.
|
12
|
+
puts "Scraping #{page.url} returned these results:"
|
11
13
|
puts "TITLE: #{page.title}"
|
12
14
|
puts "META DESCRIPTION: #{page.meta_description}"
|
13
15
|
puts "META KEYWORDS: #{page.meta_keywords}"
|
14
16
|
puts "#{page.links.size} links found..."
|
15
17
|
page.links.each do |link|
|
16
18
|
puts " ==> #{link}"
|
17
|
-
end
|
19
|
+
end
|
20
|
+
|
21
|
+
puts "to_hash..."
|
22
|
+
ap page.to_hash
|
data/samples/spider.rb
CHANGED
@@ -1,19 +1,20 @@
|
|
1
1
|
# A basic spider that will follow links on an infinite loop
|
2
|
-
|
2
|
+
$: << File.join(File.dirname(__FILE__), "/../lib")
|
3
|
+
require 'meta_inspector'
|
3
4
|
|
4
5
|
q = Queue.new
|
5
6
|
visited_links=[]
|
6
7
|
|
7
|
-
puts "Enter a valid http
|
8
|
-
|
8
|
+
puts "Enter a valid http url to spider it following external links"
|
9
|
+
url = gets.strip
|
9
10
|
|
10
|
-
page = MetaInspector.new(
|
11
|
-
q.push(
|
11
|
+
page = MetaInspector.new(url)
|
12
|
+
q.push(url)
|
12
13
|
|
13
14
|
while q.size > 0
|
14
|
-
visited_links <<
|
15
|
-
page = MetaInspector.new(
|
16
|
-
puts "Spidering #{page.
|
15
|
+
visited_links << url = q.pop
|
16
|
+
page = MetaInspector.new(url)
|
17
|
+
puts "Spidering #{page.url}"
|
17
18
|
|
18
19
|
puts "TITLE: #{page.title}"
|
19
20
|
puts "META DESCRIPTION: #{page.meta_description}"
|
data/spec/metainspector_spec.rb
CHANGED
@@ -43,6 +43,7 @@ describe MetaInspector do
|
|
43
43
|
it "should find an image" do
|
44
44
|
@m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
45
45
|
@m.image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
|
46
|
+
@m.meta_og_image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
|
46
47
|
end
|
47
48
|
|
48
49
|
it "should have a Nokogiri::HTML::Document as parsed_document" do
|
@@ -103,6 +104,10 @@ describe MetaInspector do
|
|
103
104
|
@m.meta_robots.should == 'all,follow'
|
104
105
|
end
|
105
106
|
|
107
|
+
it "should get the robots meta tag" do
|
108
|
+
@m.meta_RoBoTs.should == 'all,follow'
|
109
|
+
end
|
110
|
+
|
106
111
|
it "should get the description meta tag" do
|
107
112
|
@m.meta_description.should == 'Track your PageRank(TM) changes and receive alerts by email'
|
108
113
|
end
|
@@ -116,9 +121,8 @@ describe MetaInspector do
|
|
116
121
|
@m.meta_content_language.should == "en"
|
117
122
|
end
|
118
123
|
|
119
|
-
it "should get the
|
120
|
-
|
121
|
-
@m.meta_Content_Type.should == "text/html; charset=utf-8"
|
124
|
+
it "should get the Csrf_pAram meta tag" do
|
125
|
+
@m.meta_Csrf_pAram.should == "authenticity_token"
|
122
126
|
end
|
123
127
|
|
124
128
|
it "should get the generator meta tag" do
|
@@ -129,6 +133,17 @@ describe MetaInspector do
|
|
129
133
|
it "should return nil for nonfound meta_tags" do
|
130
134
|
@m.meta_lollypop.should == nil
|
131
135
|
end
|
136
|
+
|
137
|
+
it "should find a meta_og_title" do
|
138
|
+
@m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
139
|
+
@m.meta_og_title.should == "Apple Claims New iPhone Only Visible To Most Loyal Of Customers"
|
140
|
+
end
|
141
|
+
|
142
|
+
it "should not find a meta_og_something" do
|
143
|
+
@m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
144
|
+
@m.meta_og_something.should == nil
|
145
|
+
end
|
146
|
+
|
132
147
|
end
|
133
148
|
|
134
149
|
context 'Charset detection' do
|
@@ -146,4 +161,17 @@ describe MetaInspector do
|
|
146
161
|
@m.charset.should == "utf-8"
|
147
162
|
end
|
148
163
|
end
|
164
|
+
|
165
|
+
context 'to_hash' do
|
166
|
+
|
167
|
+
FakeWeb.register_uri(:get, "http://www.pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
168
|
+
|
169
|
+
it "should return a hash with all the values set" do
|
170
|
+
@m = MetaInspector.new('http://www.pagerankalert.com')
|
171
|
+
@m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://www.pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "links"=>["/", "/es?language=es", "/users/sign_up", "/users/sign_in", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "absolute_links"=>["http://www.pagerankalert.com/", "http://www.pagerankalert.com/es?language=es", "http://www.pagerankalert.com/users/sign_up", "http://www.pagerankalert.com/users/sign_in", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
|
172
|
+
end
|
173
|
+
|
174
|
+
end
|
175
|
+
|
176
|
+
|
149
177
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 1
|
7
|
-
-
|
7
|
+
- 6
|
8
8
|
- 0
|
9
|
-
version: 1.
|
9
|
+
version: 1.6.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jaime Iniesta
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-06-03 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -48,9 +48,24 @@ dependencies:
|
|
48
48
|
type: :runtime
|
49
49
|
version_requirements: *id002
|
50
50
|
- !ruby/object:Gem::Dependency
|
51
|
-
name:
|
51
|
+
name: rash
|
52
52
|
prerelease: false
|
53
53
|
requirement: &id003 !ruby/object:Gem::Requirement
|
54
|
+
none: false
|
55
|
+
requirements:
|
56
|
+
- - ~>
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
segments:
|
59
|
+
- 0
|
60
|
+
- 3
|
61
|
+
- 0
|
62
|
+
version: 0.3.0
|
63
|
+
type: :runtime
|
64
|
+
version_requirements: *id003
|
65
|
+
- !ruby/object:Gem::Dependency
|
66
|
+
name: rspec
|
67
|
+
prerelease: false
|
68
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
54
69
|
none: false
|
55
70
|
requirements:
|
56
71
|
- - ~>
|
@@ -61,11 +76,11 @@ dependencies:
|
|
61
76
|
- 0
|
62
77
|
version: 2.6.0
|
63
78
|
type: :development
|
64
|
-
version_requirements: *
|
79
|
+
version_requirements: *id004
|
65
80
|
- !ruby/object:Gem::Dependency
|
66
81
|
name: fakeweb
|
67
82
|
prerelease: false
|
68
|
-
requirement: &
|
83
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
69
84
|
none: false
|
70
85
|
requirements:
|
71
86
|
- - ~>
|
@@ -76,7 +91,22 @@ dependencies:
|
|
76
91
|
- 0
|
77
92
|
version: 1.3.0
|
78
93
|
type: :development
|
79
|
-
version_requirements: *
|
94
|
+
version_requirements: *id005
|
95
|
+
- !ruby/object:Gem::Dependency
|
96
|
+
name: awesome_print
|
97
|
+
prerelease: false
|
98
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
99
|
+
none: false
|
100
|
+
requirements:
|
101
|
+
- - ~>
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
segments:
|
104
|
+
- 0
|
105
|
+
- 4
|
106
|
+
- 0
|
107
|
+
version: 0.4.0
|
108
|
+
type: :development
|
109
|
+
version_requirements: *id006
|
80
110
|
description: MetaInspector lets you scrape a web page and get its title, charset, link and meta tags
|
81
111
|
email:
|
82
112
|
- jaimeiniesta@gmail.com
|