metainspector 1.2.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -18,13 +18,19 @@ or, for short, a convenience alias is also available:
18
18
 
19
19
  page = MetaInspector.new('http://pagerankalert.com')
20
20
 
21
+ If you don't include the scheme on the URL, http:// will be used
22
+ by default:
23
+
24
+ page = MetaInspector.new('pagerankalert.com')
25
+
21
26
  Then you can see the scraped data like this:
22
27
 
23
- page.address # URL of the page
28
+ page.url # URL of the page
24
29
  page.title # title of the page, as string
25
30
  page.links # array of strings, with every link found on the page
26
31
  page.meta_description # meta description, as string
27
32
  page.meta_keywords # meta keywords, as string
33
+ page.image # Most relevant image, if defined with og:image
28
34
 
29
35
  MetaInspector uses dynamic methods for meta_tag discovery, so all these will work, and will be converted to a search of a meta tag by the corresponding name, and return its content attribute
30
36
 
@@ -53,7 +59,7 @@ You can find some sample scripts on the samples folder, including a basic scrapi
53
59
  => true
54
60
 
55
61
  >> page = MetaInspector.new('http://pagerankalert.com')
56
- => #<MetaInspector:0x11330c0 @document=nil, @links=nil, @address="http://pagerankalert.com", @description=nil, @keywords=nil, @title=nil>
62
+ => #<MetaInspector:0x11330c0 @url="http://pagerankalert.com">
57
63
 
58
64
  >> page.title
59
65
  => "PageRankAlert.com :: Track your PageRank changes"
@@ -76,15 +82,18 @@ You can find some sample scripts on the samples folder, including a basic scrapi
76
82
  >> page.parsed_document.class
77
83
  => Nokogiri::HTML::Document
78
84
 
85
+ = ZOMG Fork! Thank you!
86
+
87
+ You're welcome to fork this project and send pull requests. I want to thank Ryan Romanchuk for his help https://github.com/rromanchuk
88
+
79
89
  = To Do
80
90
 
81
- * Get page.base_dir from the address
91
+ * Get page.base_dir from the URL
82
92
  * Distinguish between external and internal links, returning page.links for all of them as found, page.external_links and page.internal_links converted to absolute URLs
83
93
  * Return array of images in page as absolute URLs
84
94
  * Be able to set a timeout in seconds
85
95
  * If keywords seem to be separated by blank spaces, replace them with commas
86
96
  * Mocks
87
97
  * Check content type, process only HTML pages, don't try to scrape TAR files like http://ftp.ruby-lang.org/pub/ruby/ruby-1.9.1-p129.tar.bz2 or video files like http://isabel.dit.upm.es/component/option,com_docman/task,doc_download/gid,831/Itemid,74/
88
- * Get most important image querying Facebook
89
98
 
90
99
  Copyright (c) 2009-2011 Jaime Iniesta, released under the MIT license
@@ -9,14 +9,12 @@ require 'iconv'
9
9
  # MetaInspector provides an easy way to scrape web pages and get its elements
10
10
  module MetaInspector
11
11
  class Scraper
12
- attr_reader :address
12
+ attr_reader :url
13
13
 
14
- # Initializes a new instance of MetaInspector, setting the URL address to the one given
15
- # TODO: validate address as http URL, dont initialize it if wrong format
16
- def initialize(address)
17
- @address = address
18
-
19
- @document = @title = @description = @keywords = @links = nil
14
+ # Initializes a new instance of MetaInspector, setting the URL to the one given
15
+ # If no scheme given, set it to http:// by default
16
+ def initialize(url)
17
+ @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
20
18
  end
21
19
 
22
20
  # Returns the parsed document title, from the content of the <title> tag.
@@ -30,6 +28,13 @@ module MetaInspector
30
28
  @links ||= parsed_document.search("//a").map {|link| link.attributes["href"].to_s.strip} rescue nil
31
29
  end
32
30
 
31
+ # Returns the parsed image from Facebook's open graph property tags
32
+ # Most major websites now define this property, and it is usually very relevant
33
+ # See doc at http://developers.facebook.com/docs/opengraph/
34
+ def image
35
+ @image ||= parsed_document.document.css("meta[@property='og:image']").first['content'] rescue nil
36
+ end
37
+
33
38
  # Returns the charset
34
39
  # TODO: We should trust the charset expressed on the Content-Type meta tag
35
40
  # and only guess it if none given
@@ -47,7 +52,7 @@ module MetaInspector
47
52
 
48
53
  # Returns the original, unparsed document
49
54
  def document
50
- @document ||= open(@address).read
55
+ @document ||= open(@url).read
51
56
 
52
57
  rescue SocketError
53
58
  warn 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)'
@@ -71,7 +76,6 @@ module MetaInspector
71
76
  if method_name.to_s =~ /^meta_(.*)/
72
77
  content = parsed_document.css("meta[@name='#{$1}']").first['content'] rescue nil
73
78
  content = parsed_document.css("meta[@http-equiv='#{$1.gsub("_", "-")}']").first['content'] rescue nil if content.nil?
74
-
75
79
  content
76
80
  else
77
81
  super
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.2.0"
4
+ VERSION = "1.3.0"
5
5
  end
@@ -4,13 +4,36 @@ require File.join(File.dirname(__FILE__), "/spec_helper")
4
4
 
5
5
  describe MetaInspector do
6
6
 
7
+ context 'Initialization' do
8
+ it 'should accept an URL with a scheme' do
9
+ @m = MetaInspector.new('http://pagerankalert.com')
10
+ @m.url.should == 'http://pagerankalert.com'
11
+ end
12
+
13
+ it "should use http:// as a default scheme" do
14
+ @m = MetaInspector.new('pagerankalert.com')
15
+ @m.url.should == 'http://pagerankalert.com'
16
+ end
17
+ end
18
+
7
19
  context 'Doing a basic scrape' do
20
+ EXPECTED_TITLE = 'PageRankAlert.com :: Track your PageRank changes'
21
+
8
22
  before(:each) do
9
23
  @m = MetaInspector.new('http://pagerankalert.com')
10
24
  end
11
25
 
12
26
  it "should get the title" do
13
- @m.title.should == 'PageRankAlert.com :: Track your PageRank changes'
27
+ @m.title.should == EXPECTED_TITLE
28
+ end
29
+
30
+ it "should not find an image" do
31
+ @m.image.should == nil
32
+ end
33
+
34
+ it "should find an image" do
35
+ @m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
36
+ @m.image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
14
37
  end
15
38
 
16
39
  it "should get the links" do
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 1
7
- - 2
7
+ - 3
8
8
  - 0
9
- version: 1.2.0
9
+ version: 1.3.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jaime Iniesta
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-05-05 00:00:00 +02:00
17
+ date: 2011-05-09 00:00:00 +02:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency