metainspector 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +13 -4
- data/lib/meta_inspector/scraper.rb +13 -9
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/metainspector_spec.rb +24 -1
- metadata +3 -3
data/README.rdoc
CHANGED
@@ -18,13 +18,19 @@ or, for short, a convenience alias is also available:
|
|
18
18
|
|
19
19
|
page = MetaInspector.new('http://pagerankalert.com')
|
20
20
|
|
21
|
+
If you don't include the scheme on the URL, http:// will be used
|
22
|
+
by default:
|
23
|
+
|
24
|
+
page = MetaInspector.new('pagerankalert.com')
|
25
|
+
|
21
26
|
Then you can see the scraped data like this:
|
22
27
|
|
23
|
-
page.
|
28
|
+
page.url # URL of the page
|
24
29
|
page.title # title of the page, as string
|
25
30
|
page.links # array of strings, with every link found on the page
|
26
31
|
page.meta_description # meta description, as string
|
27
32
|
page.meta_keywords # meta keywords, as string
|
33
|
+
page.image # Most relevant image, if defined with og:image
|
28
34
|
|
29
35
|
MetaInspector uses dynamic methods for meta_tag discovery, so all these will work, and will be converted to a search of a meta tag by the corresponding name, and return its content attribute
|
30
36
|
|
@@ -53,7 +59,7 @@ You can find some sample scripts on the samples folder, including a basic scrapi
|
|
53
59
|
=> true
|
54
60
|
|
55
61
|
>> page = MetaInspector.new('http://pagerankalert.com')
|
56
|
-
=> #<MetaInspector:0x11330c0 @
|
62
|
+
=> #<MetaInspector:0x11330c0 @url="http://pagerankalert.com">
|
57
63
|
|
58
64
|
>> page.title
|
59
65
|
=> "PageRankAlert.com :: Track your PageRank changes"
|
@@ -76,15 +82,18 @@ You can find some sample scripts on the samples folder, including a basic scrapi
|
|
76
82
|
>> page.parsed_document.class
|
77
83
|
=> Nokogiri::HTML::Document
|
78
84
|
|
85
|
+
= ZOMG Fork! Thank you!
|
86
|
+
|
87
|
+
You're welcome to fork this project and send pull requests. I want to thank Ryan Romanchuk for his help https://github.com/rromanchuk
|
88
|
+
|
79
89
|
= To Do
|
80
90
|
|
81
|
-
* Get page.base_dir from the
|
91
|
+
* Get page.base_dir from the URL
|
82
92
|
* Distinguish between external and internal links, returning page.links for all of them as found, page.external_links and page.internal_links converted to absolute URLs
|
83
93
|
* Return array of images in page as absolute URLs
|
84
94
|
* Be able to set a timeout in seconds
|
85
95
|
* If keywords seem to be separated by blank spaces, replace them with commas
|
86
96
|
* Mocks
|
87
97
|
* Check content type, process only HTML pages, don't try to scrape TAR files like http://ftp.ruby-lang.org/pub/ruby/ruby-1.9.1-p129.tar.bz2 or video files like http://isabel.dit.upm.es/component/option,com_docman/task,doc_download/gid,831/Itemid,74/
|
88
|
-
* Get most important image querying Facebook
|
89
98
|
|
90
99
|
Copyright (c) 2009-2011 Jaime Iniesta, released under the MIT license
|
@@ -9,14 +9,12 @@ require 'iconv'
|
|
9
9
|
# MetaInspector provides an easy way to scrape web pages and get their elements
|
10
10
|
module MetaInspector
|
11
11
|
class Scraper
|
12
|
-
attr_reader :
|
12
|
+
attr_reader :url
|
13
13
|
|
14
|
-
# Initializes a new instance of MetaInspector, setting the URL
|
15
|
-
#
|
16
|
-
def initialize(
|
17
|
-
@
|
18
|
-
|
19
|
-
@document = @title = @description = @keywords = @links = nil
|
14
|
+
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
15
|
+
# If no scheme given, set it to http:// by default
|
16
|
+
def initialize(url)
|
17
|
+
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
20
18
|
end
|
21
19
|
|
22
20
|
# Returns the parsed document title, from the content of the <title> tag.
|
@@ -30,6 +28,13 @@ module MetaInspector
|
|
30
28
|
@links ||= parsed_document.search("//a").map {|link| link.attributes["href"].to_s.strip} rescue nil
|
31
29
|
end
|
32
30
|
|
31
|
+
# Returns the parsed image from Facebook's open graph property tags
|
32
|
+
# Most major websites now define this property, and it is usually very relevant
|
33
|
+
# See doc at http://developers.facebook.com/docs/opengraph/
|
34
|
+
def image
|
35
|
+
@image ||= parsed_document.document.css("meta[@property='og:image']").first['content'] rescue nil
|
36
|
+
end
|
37
|
+
|
33
38
|
# Returns the charset
|
34
39
|
# TODO: We should trust the charset expressed on the Content-Type meta tag
|
35
40
|
# and only guess it if none given
|
@@ -47,7 +52,7 @@ module MetaInspector
|
|
47
52
|
|
48
53
|
# Returns the original, unparsed document
|
49
54
|
def document
|
50
|
-
@document ||= open(@
|
55
|
+
@document ||= open(@url).read
|
51
56
|
|
52
57
|
rescue SocketError
|
53
58
|
warn 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)'
|
@@ -71,7 +76,6 @@ module MetaInspector
|
|
71
76
|
if method_name.to_s =~ /^meta_(.*)/
|
72
77
|
content = parsed_document.css("meta[@name='#{$1}']").first['content'] rescue nil
|
73
78
|
content = parsed_document.css("meta[@http-equiv='#{$1.gsub("_", "-")}']").first['content'] rescue nil if content.nil?
|
74
|
-
|
75
79
|
content
|
76
80
|
else
|
77
81
|
super
|
data/spec/metainspector_spec.rb
CHANGED
@@ -4,13 +4,36 @@ require File.join(File.dirname(__FILE__), "/spec_helper")
|
|
4
4
|
|
5
5
|
describe MetaInspector do
|
6
6
|
|
7
|
+
context 'Initialization' do
|
8
|
+
it 'should accept an URL with a scheme' do
|
9
|
+
@m = MetaInspector.new('http://pagerankalert.com')
|
10
|
+
@m.url.should == 'http://pagerankalert.com'
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should use http:// as a default scheme" do
|
14
|
+
@m = MetaInspector.new('pagerankalert.com')
|
15
|
+
@m.url.should == 'http://pagerankalert.com'
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
7
19
|
context 'Doing a basic scrape' do
|
20
|
+
EXPECTED_TITLE = 'PageRankAlert.com :: Track your PageRank changes'
|
21
|
+
|
8
22
|
before(:each) do
|
9
23
|
@m = MetaInspector.new('http://pagerankalert.com')
|
10
24
|
end
|
11
25
|
|
12
26
|
it "should get the title" do
|
13
|
-
@m.title.should ==
|
27
|
+
@m.title.should == EXPECTED_TITLE
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should not find an image" do
|
31
|
+
@m.image.should == nil
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should find an image" do
|
35
|
+
@m = MetaInspector.new('http://www.theonion.com/articles/apple-claims-new-iphone-only-visible-to-most-loyal,2772/')
|
36
|
+
@m.image.should == "http://o.onionstatic.com/images/articles/article/2772/Apple-Claims-600w-R_jpg_130x110_q85.jpg"
|
14
37
|
end
|
15
38
|
|
16
39
|
it "should get the links" do
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 1
|
7
|
-
-
|
7
|
+
- 3
|
8
8
|
- 0
|
9
|
-
version: 1.
|
9
|
+
version: 1.3.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jaime Iniesta
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-05-
|
17
|
+
date: 2011-05-09 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|