metainspector 1.2.0 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +13 -4
- data/lib/meta_inspector/scraper.rb +13 -9
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/metainspector_spec.rb +24 -1
- metadata +3 -3
data/README.rdoc
CHANGED
@@ -18,13 +18,19 @@ or, for short, a convenience alias is also available:
|
|
18
18
|
|
19
19
|
page = MetaInspector.new('http://pagerankalert.com')
|
20
20
|
|
21
|
+
If you don't include the scheme on the URL, http:// will be used
|
22
|
+
by default:
|
23
|
+
|
24
|
+
page = MetaInspector.new('pagerankalert.com')
|
25
|
+
|
21
26
|
Then you can see the scraped data like this:
|
22
27
|
|
23
|
-
page.
|
28
|
+
page.url # URL of the page
|
24
29
|
page.title # title of the page, as string
|
25
30
|
page.links # array of strings, with every link found on the page
|
26
31
|
page.meta_description # meta description, as string
|
27
32
|
page.meta_keywords # meta keywords, as string
|
33
|
+
page.image # Most relevant image, if defined with og:image
|
28
34
|
|
29
35
|
MetaInspector uses dynamic methods for meta_tag discovery, so all these will work, and will be converted to a search of a meta tag by the corresponding name, and return its content attribute
|
30
36
|
|
@@ -53,7 +59,7 @@ You can find some sample scripts on the samples folder, including a basic scrapi
|
|
53
59
|
=> true
|
54
60
|
|
55
61
|
>> page = MetaInspector.new('http://pagerankalert.com')
|
56
|
-
=> #<MetaInspector:0x11330c0 @
|
62
|
+
=> #<MetaInspector:0x11330c0 @url="http://pagerankalert.com">
|
57
63
|
|
58
64
|
>> page.title
|
59
65
|
=> "PageRankAlert.com :: Track your PageRank changes"
|
@@ -76,15 +82,18 @@ You can find some sample scripts on the samples folder, including a basic scrapi
|
|
76
82
|
>> page.parsed_document.class
|
77
83
|
=> Nokogiri::HTML::Document
|
78
84
|
|
85
|
+
= ZOMG Fork! Thank you!
|
86
|
+
|
87
|
+
You're welcome to fork this project and send pull requests. I want to thank Ryan Romanchuk for his help https://github.com/rromanchuk
|
88
|
+
|
79
89
|
= To Do
|
80
90
|
|
81
|
-
* Get page.base_dir from the
|
91
|
+
* Get page.base_dir from the URL
|
82
92
|
* Distinguish between external and internal links, returning page.links for all of them as found, page.external_links and page.internal_links converted to absolute URLs
|
83
93
|
* Return array of images in page as absolute URLs
|
84
94
|
* Be able to set a timeout in seconds
|
85
95
|
* If keywords seem to be separated by blank spaces, replace them with commas
|
86
96
|
* Mocks
|
87
97
|
* Check content type, process only HTML pages, don't try to scrape TAR files like http://ftp.ruby-lang.org/pub/ruby/ruby-1.9.1-p129.tar.bz2 or video files like http://isabel.dit.upm.es/component/option,com_docman/task,doc_download/gid,831/Itemid,74/
|
88
|
-
* Get most important image querying Facebook
|
89
98
|
|
90
99
|
Copyright (c) 2009-2011 Jaime Iniesta, released under the MIT license
|
@@ -9,14 +9,12 @@ require 'iconv'
|
|
9
9
|
# MetaInspector provides an easy way to scrape web pages and get their elements
|
10
10
|
module MetaInspector
|
11
11
|
class Scraper
|
12
|
-
attr_reader :
|
12
|
+
attr_reader :url
|
13
13
|
|
14
|
-
# Initializes a new instance of MetaInspector, setting the URL
|
15
|
-
#
|
16
|
-
def initialize(
|
17
|
-
@
|
18
|
-
|
19
|
-
@document = @title = @description = @keywords = @links = nil
|
14
|
+
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
15
|
+
# If no scheme given, set it to http:// by default
|
16
|
+
def initialize(url)
|
17
|
+
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
20
18
|
end
|
21
19
|
|
22
20
|
# Returns the parsed document title, from the content of the <title> tag.
|
@@ -30,6 +28,13 @@ module MetaInspector
|
|
30
28
|
@links ||= parsed_document.search("//a").map {|link| link.attributes["href"].to_s.strip} rescue nil
|
31
29
|
end
|
32
30
|
|
31
|
+
# Returns the parsed image from Facebook's open graph property tags
|
32
|
+
# Almost all major websites now define this property, and it is usually very relevant
|
33
|
+
# See doc at http://developers.facebook.com/docs/opengraph/
|
34
|
+
def image
|
35
|
+
@image ||= parsed_document.document.css("meta[@property='og:image']").first['content'] rescue nil
|
36
|
+
end
|
37
|
+
|
33
38
|
# Returns the charset
|
34
39
|
# TODO: We should trust the charset expressed on the Content-Type meta tag
|
35
40
|
# and only guess it if none given
|
@@ -47,7 +52,7 @@ module MetaInspector
|
|
47
52
|
|
48
53
|
# Returns the original, unparsed document
|
49
54
|
def document
|
50
|
-
@document ||= open(@
|
55
|
+
@document ||= open(@url).read
|
51
56
|
|
52
57
|
rescue SocketError
|
53
58
|
warn 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)'
|
@@ -71,7 +76,6 @@ module MetaInspector
|
|
71
76
|
if method_name.to_s =~ /^meta_(.*)/
|
72
77
|
content = parsed_document.css("meta[@name='#{$1}']").first['content'] rescue nil
|
73
78
|
content = parsed_document.css("meta[@http-equiv='#{$1.gsub("_", "-")}']").first['content'] rescue nil if content.nil?
|
74
|
-
|
75
79
|
content
|
76
80
|
else
|
77
81
|
super
|
data/spec/metainspector_spec.rb
CHANGED
@@ -4,13 +4,36 @@ require File.join(File.dirname(__FILE__), "/spec_helper")
|
|
4
4
|
|
5
5
|
describe MetaInspector do
|
6
6
|
|
7
|
+
context 'Initialization' do
|
8
|
+
it 'should accept an URL with a scheme' do
|
9
|
+
@m = MetaInspector.new('http://pagerankalert.com')
|
10
|
+
@m.url.should == 'http://pagerankalert.com'
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should use http:// as a default scheme" do
|
14
|
+
@m = MetaInspector.new('pagerankalert.com')
|
15
|
+
@m.url.should == 'http://pagerankalert.com'
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
7
19
|
context 'Doing a basic scrape' do
|
20
|
+
EXPECTED_TITLE = 'PageRankAlert.com :: Track your PageRank changes'
|
21
|
+
|
8
22
|
before(:each) do
|
9
23
|
@m = MetaInspector.new('http://pagerankalert.com')
|
10
24
|
end
|
11
25
|
|
12
26
|
it "should get the title" do
|
13
|
-
@m.title.should ==
|
27
|
+
@m.title.should == EXPECTED_TITLE
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should not find an image" do
|
31
|
+
@m.image.should == nil
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should find an image" do
|
35
|
+
@m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
36
|
+
@m.image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
|
14
37
|
end
|
15
38
|
|
16
39
|
it "should get the links" do
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 1
|
7
|
-
-
|
7
|
+
- 3
|
8
8
|
- 0
|
9
|
-
version: 1.
|
9
|
+
version: 1.3.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jaime Iniesta
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-05-
|
17
|
+
date: 2011-05-09 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|