metainspector 1.9.0 → 1.9.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +18 -12
- data/lib/meta_inspector/scraper.rb +7 -5
- data/lib/meta_inspector/version.rb +1 -1
- data/lib/meta_inspector.rb +2 -2
- data/meta_inspector.gemspec +1 -1
- metadata +6 -7
data/README.rdoc
CHANGED
@@ -14,16 +14,21 @@ This gem is tested on Ruby versions 1.8.7, 1.9.2 and 1.9.3.
|
|
14
14
|
|
15
15
|
Initialize a scraper instance for a URL, like this:
|
16
16
|
|
17
|
-
page = MetaInspector::Scraper.new('http://
|
17
|
+
page = MetaInspector::Scraper.new('http://w3clove.com')
|
18
18
|
|
19
19
|
or, for short, a convenience alias is also available:
|
20
20
|
|
21
|
-
page = MetaInspector.new('http://
|
21
|
+
page = MetaInspector.new('http://w3clove.com')
|
22
22
|
|
23
23
|
If you don't include the scheme on the URL, http:// will be used
|
24
24
|
by default:
|
25
25
|
|
26
|
-
page = MetaInspector.new('
|
26
|
+
page = MetaInspector.new('w3clove.com')
|
27
|
+
|
28
|
+
By default, MetaInspector times out after 20 seconds of waiting for a page to respond.
|
29
|
+
You can set a different timeout with a second parameter, like this:
|
30
|
+
|
31
|
+
page = MetaInspector.new('w3clove.com', 5) # this would wait just 5 seconds to timeout
|
27
32
|
|
28
33
|
Then you can see the scraped data like this:
|
29
34
|
|
@@ -58,7 +63,7 @@ Please notice that MetaInspector is case sensitive, so page.meta_Content_Type is
|
|
58
63
|
|
59
64
|
You can also access most of the scraped data as a hash:
|
60
65
|
|
61
|
-
page.to_hash # { "url"=>"http://
|
66
|
+
page.to_hash # { "url"=>"http://w3clove.com", "title" => "W3CLove :: site-wide markup validation tool", ... }
|
62
67
|
|
63
68
|
The full scraped document is accessible from:
|
64
69
|
|
@@ -72,23 +77,23 @@ You can find some sample scripts on the samples folder, including a basic scrapi
|
|
72
77
|
>> require 'metainspector'
|
73
78
|
=> true
|
74
79
|
|
75
|
-
>> page = MetaInspector.new('http://
|
76
|
-
=> #<MetaInspector:0x11330c0 @url="http://
|
80
|
+
>> page = MetaInspector.new('http://w3clove.com')
|
81
|
+
=> #<MetaInspector:0x11330c0 @url="http://w3clove.com">
|
77
82
|
|
78
83
|
>> page.title
|
79
|
-
=> "
|
84
|
+
=> "W3CLove :: site-wide markup validation tool"
|
80
85
|
|
81
86
|
>> page.meta_description
|
82
|
-
=> "
|
87
|
+
=> "Site-wide markup validation tool. Validate the markup of your whole site with just one click."
|
83
88
|
|
84
89
|
>> page.meta_keywords
|
85
|
-
=> "
|
90
|
+
=> "html, markup, validation, validator, tool, w3c, development, standards, free"
|
86
91
|
|
87
92
|
>> page.links.size
|
88
|
-
=>
|
93
|
+
=> 15
|
89
94
|
|
90
|
-
>> page.links[
|
91
|
-
=> "
|
95
|
+
>> page.links[4]
|
96
|
+
=> "/plans-and-pricing"
|
92
97
|
|
93
98
|
>> page.document.class
|
94
99
|
=> String
|
@@ -103,6 +108,7 @@ You're welcome to fork this project and send pull requests. I want to thank spec
|
|
103
108
|
* Ryan Romanchuk https://github.com/rromanchuk
|
104
109
|
* Edmund Haselwanter https://github.com/ehaselwanter
|
105
110
|
* Jonathan Hernández https://github.com/ionmx
|
111
|
+
* Oriol Gual https://github.com/oriolgual
|
106
112
|
|
107
113
|
= To Do
|
108
114
|
|
@@ -4,6 +4,7 @@ require 'open-uri'
|
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'charguess'
|
6
6
|
require 'hashie/rash'
|
7
|
+
require 'timeout'
|
7
8
|
|
8
9
|
# MetaInspector provides an easy way to scrape web pages and get its elements
|
9
10
|
module MetaInspector
|
@@ -11,10 +12,11 @@ module MetaInspector
|
|
11
12
|
attr_reader :url, :scheme
|
12
13
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
13
14
|
# If no scheme given, set it to http:// by default
|
14
|
-
def initialize(url)
|
15
|
-
@url
|
16
|
-
@scheme
|
17
|
-
@
|
15
|
+
def initialize(url, timeout = 20)
|
16
|
+
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
17
|
+
@scheme = URI.parse(url).scheme || 'http'
|
18
|
+
@timeout = timeout
|
19
|
+
@data = Hashie::Rash.new('url' => @url)
|
18
20
|
end
|
19
21
|
|
20
22
|
# Returns the parsed document title, from the content of the <title> tag.
|
@@ -92,7 +94,7 @@ module MetaInspector
|
|
92
94
|
|
93
95
|
# Returns the original, unparsed document
|
94
96
|
def document
|
95
|
-
@document ||= open(@url).read
|
97
|
+
@document ||= Timeout::timeout(@timeout) { open(@url).read }
|
96
98
|
|
97
99
|
rescue SocketError
|
98
100
|
warn 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)'
|
data/lib/meta_inspector.rb
CHANGED
data/meta_inspector.gemspec
CHANGED
@@ -14,7 +14,7 @@ Gem::Specification.new do |gem|
|
|
14
14
|
gem.require_paths = ["lib"]
|
15
15
|
gem.version = MetaInspector::VERSION
|
16
16
|
|
17
|
-
gem.add_dependency 'nokogiri', '1.5
|
17
|
+
gem.add_dependency 'nokogiri', '~> 1.5'
|
18
18
|
gem.add_dependency 'charguess', '1.3.20111021164500'
|
19
19
|
gem.add_dependency 'rash', '0.3.2'
|
20
20
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 49
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 9
|
9
|
-
-
|
10
|
-
version: 1.9.
|
9
|
+
- 1
|
10
|
+
version: 1.9.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-
|
18
|
+
date: 2012-07-11 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -23,14 +23,13 @@ dependencies:
|
|
23
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
24
24
|
none: false
|
25
25
|
requirements:
|
26
|
-
- -
|
26
|
+
- - ~>
|
27
27
|
- !ruby/object:Gem::Version
|
28
28
|
hash: 5
|
29
29
|
segments:
|
30
30
|
- 1
|
31
31
|
- 5
|
32
|
-
|
33
|
-
version: 1.5.3
|
32
|
+
version: "1.5"
|
34
33
|
type: :runtime
|
35
34
|
version_requirements: *id001
|
36
35
|
- !ruby/object:Gem::Dependency
|