metainspector 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -32,7 +32,9 @@ Then you can see the scraped data like this:
  page.meta_description # meta description, as string
  page.meta_keywords # meta keywords, as string
  page.image # Most relevant image, if defined with og:image
- page.feed # Get rss or atom links in meta data fields as array
+ page.feed # Get rss or atom links in meta data fields as array
+ page.meta_og_title # opengraph title
+ page.meta_og_image # opengraph image
 
  MetaInspector uses dynamic methods for meta_tag discovery, so all these will work, and will be converted to a search of a meta tag by the corresponding name, and return its content attribute
 
@@ -48,6 +50,10 @@ It will also work for the meta tags of the form <meta http-equiv="name" ... />,
 
  Please notice that MetaInspector is case sensitive, so page.meta_Content_Type is not the same as page.meta_content_type
 
+ You can also access most of the scraped data as a hash:
+
+ page.to_hash # { "url"=>"http://pagerankalert.com", "title" => "PageRankAlert.com", ... }
+
  The full scraped document if accessible from:
 
  page.document # Nokogiri doc that you can use it to get any element from the page
@@ -100,5 +106,6 @@ You're welcome to fork this project and send pull requests. I want to thank spec
  * If keywords seem to be separated by blank spaces, replace them with commas
  * Mocks
  * Check content type, process only HTML pages, don't try to scrape TAR files like http://ftp.ruby-lang.org/pub/ruby/ruby-1.9.1-p129.tar.bz2 or video files like http://isabel.dit.upm.es/component/option,com_docman/task,doc_download/gid,831/Itemid,74/
+ * Autodiscover all available meta tags
 
  Copyright (c) 2009-2011 Jaime Iniesta, released under the MIT license
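
A quick way to exercise the README additions above (a minimal sketch; the URL and comments are illustrative, not taken from the gem's docs):

  require 'meta_inspector'

  page = MetaInspector.new('http://example.com')
  page.meta_og_title  # OpenGraph title, resolved by the dynamic meta_* lookup
  page.meta_og_image  # OpenGraph image; page.image now delegates to this
  page.to_hash        # most of the scraped data as a plain Hash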
data/lib/meta_inspector.rb CHANGED
@@ -5,39 +5,40 @@ require 'rubygems'
  require 'nokogiri'
  require 'charguess'
  require 'iconv'
+ require 'hashie/rash'
 
  # MetaInspector provides an easy way to scrape web pages and get its elements
  module MetaInspector
    class Scraper
      attr_reader :url
-
      # Initializes a new instance of MetaInspector, setting the URL to the one given
      # If no scheme given, set it to http:// by default
      def initialize(url)
        @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
+       @data = Hashie::Rash.new('url' => @url)
      end
 
      # Returns the parsed document title, from the content of the <title> tag.
      # This is not the same as the meta_tite tag
      def title
-       @title ||= parsed_document.css('title').inner_html rescue nil
+       @data.title ||= parsed_document.css('title').inner_html rescue nil
      end
 
      # Returns the parsed document links
      def links
-       @links ||= remove_mailto(parsed_document.search("//a")
-         .map {|link| link.attributes["href"]
+       @data.links ||= remove_mailto(parsed_document.search("//a") \
+         .map {|link| link.attributes["href"] \
          .to_s.strip}.uniq) rescue nil
      end
 
      # Returns the links converted to absolute urls
      def absolute_links
-       @absolute_links ||= links.map { |l| absolutify_url(l) }
+       @data.absolute_links ||= links.map { |l| absolutify_url(l) }
      end
 
      # Returns the parsed document meta rss links
      def feed
-       @feed ||= parsed_document.xpath("//link").select{ |link|
+       @data.feed ||= parsed_document.xpath("//link").select{ |link|
          link.attributes["type"] && link.attributes["type"].value =~ /(atom|rss)/
        }.map { |link|
          absolutify_url(link.attributes["href"].value)
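
The scraper now memoizes everything it extracts in a Hashie::Rash instead of per-attribute instance variables, which is what makes the new to_hash method possible. A minimal sketch of the Rash behaviour being relied on (assuming the rash gem declared further down in the gemspec; the example values are illustrative):

  require 'hashie/rash'

  data = Hashie::Rash.new('url' => 'http://example.com')
  data.title = 'Example'  # method-style writer, stored under the "title" key
  data.title              # => "Example", method-style reader
  data.meta!.name!        # bang calls auto-create nested sub-hashes
  data.to_hash            # => plain nested Hash, later returned by Scraper#to_hash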
@@ -48,14 +49,21 @@ module MetaInspector
      # Most all major websites now define this property and is usually very relevant
      # See doc at http://developers.facebook.com/docs/opengraph/
      def image
-       @image ||= parsed_document.document.css("meta[@property='og:image']").first['content'] rescue nil
+       meta_og_image
      end
 
      # Returns the charset
      # TODO: We should trust the charset expressed on the Content-Type meta tag
      # and only guess it if none given
      def charset
-       @charset ||= CharGuess.guess(document).downcase
+       @data.charset ||= CharGuess.guess(document).downcase
+     end
+
+     # Returns all parsed data as a nested Hash
+     def to_hash
+       # TODO: find a better option to populate the data to the Hash
+       image;feed;links;charset;absolute_links;title;meta_keywords
+       @data.to_hash
      end
 
      # Returns the whole parsed document
@@ -85,14 +93,23 @@ module MetaInspector
      #
      # It will first try with meta name="..." and if nothing found,
      # with meta http-equiv="...", substituting "_" by "-"
-     # TODO: this should be case unsensitive, so meta_robots gets the results from the HTML for robots, Robots, ROBOTS...
-     # TODO: cache results on instance variables, using ||=
      # TODO: define respond_to? to return true on the meta_name methods
      def method_missing(method_name)
        if method_name.to_s =~ /^meta_(.*)/
-         content = parsed_document.css("meta[@name='#{$1}']").first['content'] rescue nil
-         content = parsed_document.css("meta[@http-equiv='#{$1.gsub("_", "-")}']").first['content'] rescue nil if content.nil?
-         content
+         key = $1
+         #special treatment for og:
+         if key =~ /^og_(.*)/
+           key = "og:#{$1}"
+         end
+         unless @data.meta
+           @data.meta!.name!
+           @data.meta!.property!
+           parsed_document.xpath("//meta").each { |element|
+             @data.meta.name[element.attributes["name"].value.downcase] = element.attributes["content"].value if element.attributes["name"]
+             @data.meta.property[element.attributes["property"].value.downcase] = element.attributes["content"].value if element.attributes["property"]
+           }
+         end
+         @data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
        else
          super
        end
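
The rewritten method_missing above parses every <meta> tag once, caches name/content and property/content pairs under @data.meta, and answers all meta_* calls from that cache. A rough usage sketch (the URL is illustrative; the behaviour matches the specs further down):

  page = MetaInspector.new('http://www.pagerankalert.com')
  page.meta_description  # content of <meta name="description" ...>
  page.meta_RoBoTs       # same as page.meta_robots: keys and lookups are downcased
  page.meta_og_title     # "og_" is rewritten to "og:" and matched against property="og:title"
  page.meta_lollypop     # => nil when no matching meta tag exists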
@@ -1,5 +1,5 @@
  # -*- encoding: utf-8 -*-
 
  module MetaInspector
-   VERSION = "1.5.0"
+   VERSION = "1.6.0"
  end
data/lib/metainspector.rb CHANGED
@@ -1,3 +1,3 @@
  # -*- encoding: utf-8 -*-
 
- require 'meta_inspector'
+ require File.expand_path(File.join(File.dirname(__FILE__), './meta_inspector'))
@@ -21,8 +21,9 @@ Gem::Specification.new do |s|
 
    s.add_dependency 'nokogiri', '1.4.4'
    s.add_dependency 'charguess', '1.3.20110226181011'
+   s.add_dependency "rash", "~> 0.3.0"
 
    s.add_development_dependency 'rspec', '~> 2.6.0'
    s.add_development_dependency 'fakeweb', '~> 1.3.0'
-
+   s.add_development_dependency 'awesome_print', '~> 0.4.0'
  end
@@ -1,17 +1,22 @@
  # Some basic MetaInspector samples
 
- require_relative '../lib/meta_inspector.rb'
+ $: << File.join(File.dirname(__FILE__), "/../lib")
+ require 'meta_inspector'
+ require 'ap'
 
- puts "Enter a valid http address to scrape it"
- address = gets.strip
- page = MetaInspector.new(address)
+ puts "Enter a valid http url to scrape it"
+ url = gets.strip
+ page = MetaInspector.new(url)
  puts "...please wait while scraping the page..."
 
- puts "Scraping #{page.address} returned these results:"
+ puts "Scraping #{page.url} returned these results:"
  puts "TITLE: #{page.title}"
  puts "META DESCRIPTION: #{page.meta_description}"
  puts "META KEYWORDS: #{page.meta_keywords}"
  puts "#{page.links.size} links found..."
  page.links.each do |link|
    puts " ==> #{link}"
- end
+ end
+
+ puts "to_hash..."
+ ap page.to_hash
data/samples/spider.rb CHANGED
@@ -1,19 +1,20 @@
  # A basic spider that will follow links on an infinite loop
- require_relative '../lib/meta_inspector.rb'
+ $: << File.join(File.dirname(__FILE__), "/../lib")
+ require 'meta_inspector'
 
  q = Queue.new
  visited_links=[]
 
- puts "Enter a valid http address to spider it following external links"
- address = gets.strip
+ puts "Enter a valid http url to spider it following external links"
+ url = gets.strip
 
- page = MetaInspector.new(address)
- q.push(address)
+ page = MetaInspector.new(url)
+ q.push(url)
 
  while q.size > 0
-   visited_links << address = q.pop
-   page = MetaInspector.new(address)
-   puts "Spidering #{page.address}"
+   visited_links << url = q.pop
+   page = MetaInspector.new(url)
+   puts "Spidering #{page.url}"
 
    puts "TITLE: #{page.title}"
    puts "META DESCRIPTION: #{page.meta_description}"
@@ -43,6 +43,7 @@ describe MetaInspector do
    it "should find an image" do
      @m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
      @m.image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
+     @m.meta_og_image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
    end
 
    it "should have a Nokogiri::HTML::Document as parsed_document" do
@@ -103,6 +104,10 @@ describe MetaInspector do
      @m.meta_robots.should == 'all,follow'
    end
 
+   it "should get the robots meta tag" do
+     @m.meta_RoBoTs.should == 'all,follow'
+   end
+
    it "should get the description meta tag" do
      @m.meta_description.should == 'Track your PageRank(TM) changes and receive alerts by email'
    end
@@ -116,9 +121,8 @@ describe MetaInspector do
      @m.meta_content_language.should == "en"
    end
 
-   it "should get the Content-Type meta tag" do
-     pending "mocks"
-     @m.meta_Content_Type.should == "text/html; charset=utf-8"
+   it "should get the Csrf_pAram meta tag" do
+     @m.meta_Csrf_pAram.should == "authenticity_token"
    end
 
    it "should get the generator meta tag" do
@@ -129,6 +133,17 @@ describe MetaInspector do
    it "should return nil for nonfound meta_tags" do
      @m.meta_lollypop.should == nil
    end
+
+   it "should find a meta_og_title" do
+     @m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
+     @m.meta_og_title.should == "Apple Claims New iPhone Only Visible To Most Loyal Of Customers"
+   end
+
+   it "should not find a meta_og_something" do
+     @m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
+     @m.meta_og_something.should == nil
+   end
+
  end
 
  context 'Charset detection' do
@@ -146,4 +161,17 @@ describe MetaInspector do
      @m.charset.should == "utf-8"
    end
  end
+
+ context 'to_hash' do
+
+   FakeWeb.register_uri(:get, "http://www.pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
+
+   it "should return a hash with all the values set" do
+     @m = MetaInspector.new('http://www.pagerankalert.com')
+     @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://www.pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "links"=>["/", "/es?language=es", "/users/sign_up", "/users/sign_in", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "absolute_links"=>["http://www.pagerankalert.com/", "http://www.pagerankalert.com/es?language=es", "http://www.pagerankalert.com/users/sign_up", "http://www.pagerankalert.com/users/sign_in", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
+   end
+
+ end
+
+
 end
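
The new to_hash spec above stubs the network with FakeWeb rather than hitting the real site. A minimal sketch of that pattern (fixture_file is a spec helper assumed to read a recorded HTTP response from disk):

  require 'fakeweb'

  # Serve a canned response instead of making a real HTTP request
  FakeWeb.register_uri(:get, "http://www.pagerankalert.com",
                       :response => fixture_file("pagerankalert.com.response"))

  m = MetaInspector.new('http://www.pagerankalert.com')
  m.title  # parsed from the fixture, no network access needed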
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
    prerelease: false
    segments:
    - 1
-   - 5
+   - 6
    - 0
-   version: 1.5.0
+   version: 1.6.0
  platform: ruby
  authors:
  - Jaime Iniesta
@@ -14,7 +14,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2011-05-30 00:00:00 +02:00
+ date: 2011-06-03 00:00:00 +02:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -48,9 +48,24 @@ dependencies:
    type: :runtime
    version_requirements: *id002
  - !ruby/object:Gem::Dependency
-   name: rspec
+   name: rash
    prerelease: false
    requirement: &id003 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         segments:
+         - 0
+         - 3
+         - 0
+         version: 0.3.0
+   type: :runtime
+   version_requirements: *id003
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   prerelease: false
+   requirement: &id004 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ~>
@@ -61,11 +76,11 @@ dependencies:
        - 0
        version: 2.6.0
    type: :development
-   version_requirements: *id003
+   version_requirements: *id004
  - !ruby/object:Gem::Dependency
    name: fakeweb
    prerelease: false
-   requirement: &id004 !ruby/object:Gem::Requirement
+   requirement: &id005 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ~>
@@ -76,7 +91,22 @@ dependencies:
        - 0
        version: 1.3.0
    type: :development
-   version_requirements: *id004
+   version_requirements: *id005
+ - !ruby/object:Gem::Dependency
+   name: awesome_print
+   prerelease: false
+   requirement: &id006 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         segments:
+         - 0
+         - 4
+         - 0
+         version: 0.4.0
+   type: :development
+   version_requirements: *id006
  description: MetaInspector lets you scrape a web page and get its title, charset, link and meta tags
  email:
  - jaimeiniesta@gmail.com