metainspector 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -18,13 +18,19 @@ or, for short, a convenience alias is also available:
18
18
 
19
19
  page = MetaInspector.new('http://pagerankalert.com')
20
20
 
21
+ If you don't include the scheme on the URL, http:// will be used
22
+ by default:
23
+
24
+ page = MetaInspector.new('pagerankalert.com')
25
+
21
26
  Then you can see the scraped data like this:
22
27
 
23
- page.address # URL of the page
28
+ page.url # URL of the page
24
29
  page.title # title of the page, as string
25
30
  page.links # array of strings, with every link found on the page
26
31
  page.meta_description # meta description, as string
27
32
  page.meta_keywords # meta keywords, as string
33
+ page.image # Most relevant image, if defined with og:image
28
34
 
29
35
  MetaInspector uses dynamic methods for meta_tag discovery, so all these will work, and will be converted to a search of a meta tag by the corresponding name, and return its content attribute
30
36
 
@@ -53,7 +59,7 @@ You can find some sample scripts on the samples folder, including a basic scrapi
53
59
  => true
54
60
 
55
61
  >> page = MetaInspector.new('http://pagerankalert.com')
56
- => #<MetaInspector:0x11330c0 @document=nil, @links=nil, @address="http://pagerankalert.com", @description=nil, @keywords=nil, @title=nil>
62
+ => #<MetaInspector:0x11330c0 @url="http://pagerankalert.com">
57
63
 
58
64
  >> page.title
59
65
  => "PageRankAlert.com :: Track your PageRank changes"
@@ -76,15 +82,18 @@ You can find some sample scripts on the samples folder, including a basic scrapi
76
82
  >> page.parsed_document.class
77
83
  => Nokogiri::HTML::Document
78
84
 
85
+ = ZOMG Fork! Thank you!
86
+
87
+ You're welcome to fork this project and send pull requests. I want to thank Ryan Romanchuk for his help https://github.com/rromanchuk
88
+
79
89
  = To Do
80
90
 
81
- * Get page.base_dir from the address
91
+ * Get page.base_dir from the URL
82
92
  * Distinguish between external and internal links, returning page.links for all of them as found, page.external_links and page.internal_links converted to absolute URLs
83
93
  * Return array of images in page as absolute URLs
84
94
  * Be able to set a timeout in seconds
85
95
  * If keywords seem to be separated by blank spaces, replace them with commas
86
96
  * Mocks
87
97
  * Check content type, process only HTML pages, don't try to scrape TAR files like http://ftp.ruby-lang.org/pub/ruby/ruby-1.9.1-p129.tar.bz2 or video files like http://isabel.dit.upm.es/component/option,com_docman/task,doc_download/gid,831/Itemid,74/
88
- * Get most important image querying Facebook
89
98
 
90
99
  Copyright (c) 2009-2011 Jaime Iniesta, released under the MIT license
@@ -9,14 +9,12 @@ require 'iconv'
9
9
  # MetaInspector provides an easy way to scrape web pages and get its elements
10
10
  module MetaInspector
11
11
  class Scraper
12
- attr_reader :address
12
+ attr_reader :url
13
13
 
14
- # Initializes a new instance of MetaInspector, setting the URL address to the one given
15
- # TODO: validate address as http URL, dont initialize it if wrong format
16
- def initialize(address)
17
- @address = address
18
-
19
- @document = @title = @description = @keywords = @links = nil
14
+ # Initializes a new instance of MetaInspector, setting the URL to the one given
15
+ # If no scheme given, set it to http:// by default
16
+ def initialize(url)
17
+ @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
20
18
  end
21
19
 
22
20
  # Returns the parsed document title, from the content of the <title> tag.
@@ -30,6 +28,13 @@ module MetaInspector
30
28
  @links ||= parsed_document.search("//a").map {|link| link.attributes["href"].to_s.strip} rescue nil
31
29
  end
32
30
 
31
+ # Returns the parsed image from Facebook's open graph property tags
32
+ # Most major websites now define this property, and it is usually very relevant
33
+ # See doc at http://developers.facebook.com/docs/opengraph/
34
+ def image
35
+ @image ||= parsed_document.document.css("meta[@property='og:image']").first['content'] rescue nil
36
+ end
37
+
33
38
  # Returns the charset
34
39
  # TODO: We should trust the charset expressed on the Content-Type meta tag
35
40
  # and only guess it if none given
@@ -47,7 +52,7 @@ module MetaInspector
47
52
 
48
53
  # Returns the original, unparsed document
49
54
  def document
50
- @document ||= open(@address).read
55
+ @document ||= open(@url).read
51
56
 
52
57
  rescue SocketError
53
58
  warn 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)'
@@ -71,7 +76,6 @@ module MetaInspector
71
76
  if method_name.to_s =~ /^meta_(.*)/
72
77
  content = parsed_document.css("meta[@name='#{$1}']").first['content'] rescue nil
73
78
  content = parsed_document.css("meta[@http-equiv='#{$1.gsub("_", "-")}']").first['content'] rescue nil if content.nil?
74
-
75
79
  content
76
80
  else
77
81
  super
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "1.2.0"
4
+ VERSION = "1.3.0"
5
5
  end
@@ -4,13 +4,36 @@ require File.join(File.dirname(__FILE__), "/spec_helper")
4
4
 
5
5
  describe MetaInspector do
6
6
 
7
+ context 'Initialization' do
8
+ it 'should accept an URL with a scheme' do
9
+ @m = MetaInspector.new('http://pagerankalert.com')
10
+ @m.url.should == 'http://pagerankalert.com'
11
+ end
12
+
13
+ it "should use http:// as a default scheme" do
14
+ @m = MetaInspector.new('pagerankalert.com')
15
+ @m.url.should == 'http://pagerankalert.com'
16
+ end
17
+ end
18
+
7
19
  context 'Doing a basic scrape' do
20
+ EXPECTED_TITLE = 'PageRankAlert.com :: Track your PageRank changes'
21
+
8
22
  before(:each) do
9
23
  @m = MetaInspector.new('http://pagerankalert.com')
10
24
  end
11
25
 
12
26
  it "should get the title" do
13
- @m.title.should == 'PageRankAlert.com :: Track your PageRank changes'
27
+ @m.title.should == EXPECTED_TITLE
28
+ end
29
+
30
+ it "should not find an image" do
31
+ @m.image.should == nil
32
+ end
33
+
34
+ it "should find an image" do
35
+ @m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
36
+ @m.image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
14
37
  end
15
38
 
16
39
  it "should get the links" do
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 1
7
- - 2
7
+ - 3
8
8
  - 0
9
- version: 1.2.0
9
+ version: 1.3.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jaime Iniesta
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-05-05 00:00:00 +02:00
17
+ date: 2011-05-09 00:00:00 +02:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency