metainspector 1.9.0 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +18 -12
- data/lib/meta_inspector/scraper.rb +7 -5
- data/lib/meta_inspector/version.rb +1 -1
- data/lib/meta_inspector.rb +2 -2
- data/meta_inspector.gemspec +1 -1
- metadata +6 -7
data/README.rdoc
CHANGED
@@ -14,16 +14,21 @@ This gem is tested on Ruby versions 1.8.7, 1.9.2 and 1.9.3.
|
|
14
14
|
|
15
15
|
Initialize a scraper instance for an URL, like this:
|
16
16
|
|
17
|
-
page = MetaInspector::Scraper.new('http://
|
17
|
+
page = MetaInspector::Scraper.new('http://w3clove.com')
|
18
18
|
|
19
19
|
or, for short, a convenience alias is also available:
|
20
20
|
|
21
|
-
page = MetaInspector.new('http://
|
21
|
+
page = MetaInspector.new('http://w3clove.com')
|
22
22
|
|
23
23
|
If you don't include the scheme on the URL, http:// will be used
|
24
24
|
by default:
|
25
25
|
|
26
|
-
page = MetaInspector.new('
|
26
|
+
page = MetaInspector.new('w3clove.com')
|
27
|
+
|
28
|
+
By default, MetaInspector times out after 20 seconds of waiting for a page to respond.
|
29
|
+
You can set a different timeout with a second parameter, like this:
|
30
|
+
|
31
|
+
page = MetaInspector.new('w3clove.com', 5) # this would wait just 5 seconds to timeout
|
27
32
|
|
28
33
|
Then you can see the scraped data like this:
|
29
34
|
|
@@ -58,7 +63,7 @@ Please notice that MetaInspector is case sensitive, so page.meta_Content_Type is
|
|
58
63
|
|
59
64
|
You can also access most of the scraped data as a hash:
|
60
65
|
|
61
|
-
page.to_hash # { "url"=>"http://
|
66
|
+
page.to_hash # { "url"=>"http://w3clove.com", "title" => "W3CLove :: site-wide markup validation tool", ... }
|
62
67
|
|
63
68
|
The full scraped document is accessible from:
|
64
69
|
|
@@ -72,23 +77,23 @@ You can find some sample scripts on the samples folder, including a basic scrapi
|
|
72
77
|
>> require 'metainspector'
|
73
78
|
=> true
|
74
79
|
|
75
|
-
>> page = MetaInspector.new('http://
|
76
|
-
=> #<MetaInspector:0x11330c0 @url="http://
|
80
|
+
>> page = MetaInspector.new('http://w3clove.com')
|
81
|
+
=> #<MetaInspector:0x11330c0 @url="http://w3clove.com">
|
77
82
|
|
78
83
|
>> page.title
|
79
|
-
=> "
|
84
|
+
=> "W3CLove :: site-wide markup validation tool"
|
80
85
|
|
81
86
|
>> page.meta_description
|
82
|
-
=> "
|
87
|
+
=> "Site-wide markup validation tool. Validate the markup of your whole site with just one click."
|
83
88
|
|
84
89
|
>> page.meta_keywords
|
85
|
-
=> "
|
90
|
+
=> "html, markup, validation, validator, tool, w3c, development, standards, free"
|
86
91
|
|
87
92
|
>> page.links.size
|
88
|
-
=>
|
93
|
+
=> 15
|
89
94
|
|
90
|
-
>> page.links[
|
91
|
-
=> "
|
95
|
+
>> page.links[4]
|
96
|
+
=> "/plans-and-pricing"
|
92
97
|
|
93
98
|
>> page.document.class
|
94
99
|
=> String
|
@@ -103,6 +108,7 @@ You're welcome to fork this project and send pull requests. I want to thank spec
|
|
103
108
|
* Ryan Romanchuk https://github.com/rromanchuk
|
104
109
|
* Edmund Haselwanter https://github.com/ehaselwanter
|
105
110
|
* Jonathan Hernández https://github.com/ionmx
|
111
|
+
* Oriol Gual https://github.com/oriolgual
|
106
112
|
|
107
113
|
= To Do
|
108
114
|
|
data/lib/meta_inspector/scraper.rb
CHANGED
@@ -4,6 +4,7 @@ require 'open-uri'
|
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'charguess'
|
6
6
|
require 'hashie/rash'
|
7
|
+
require 'timeout'
|
7
8
|
|
8
9
|
# MetaInspector provides an easy way to scrape web pages and get its elements
|
9
10
|
module MetaInspector
|
@@ -11,10 +12,11 @@ module MetaInspector
|
|
11
12
|
attr_reader :url, :scheme
|
12
13
|
# Initializes a new instance of MetaInspector, setting the URL to the one given
|
13
14
|
# If no scheme given, set it to http:// by default
|
14
|
-
def initialize(url)
|
15
|
-
@url
|
16
|
-
@scheme
|
17
|
-
@
|
15
|
+
def initialize(url, timeout = 20)
|
16
|
+
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url
|
17
|
+
@scheme = URI.parse(url).scheme || 'http'
|
18
|
+
@timeout = timeout
|
19
|
+
@data = Hashie::Rash.new('url' => @url)
|
18
20
|
end
|
19
21
|
|
20
22
|
# Returns the parsed document title, from the content of the <title> tag.
|
@@ -92,7 +94,7 @@ module MetaInspector
|
|
92
94
|
|
93
95
|
# Returns the original, unparsed document
|
94
96
|
def document
|
95
|
-
@document ||= open(@url).read
|
97
|
+
@document ||= Timeout::timeout(@timeout) { open(@url).read }
|
96
98
|
|
97
99
|
rescue SocketError
|
98
100
|
warn 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)'
|
data/lib/meta_inspector.rb
CHANGED
data/meta_inspector.gemspec
CHANGED
@@ -14,7 +14,7 @@ Gem::Specification.new do |gem|
|
|
14
14
|
gem.require_paths = ["lib"]
|
15
15
|
gem.version = MetaInspector::VERSION
|
16
16
|
|
17
|
-
gem.add_dependency 'nokogiri', '1.5
|
17
|
+
gem.add_dependency 'nokogiri', '~> 1.5'
|
18
18
|
gem.add_dependency 'charguess', '1.3.20111021164500'
|
19
19
|
gem.add_dependency 'rash', '0.3.2'
|
20
20
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 49
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 9
|
9
|
-
-
|
10
|
-
version: 1.9.
|
9
|
+
- 1
|
10
|
+
version: 1.9.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jaime Iniesta
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-
|
18
|
+
date: 2012-07-11 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: nokogiri
|
@@ -23,14 +23,13 @@ dependencies:
|
|
23
23
|
requirement: &id001 !ruby/object:Gem::Requirement
|
24
24
|
none: false
|
25
25
|
requirements:
|
26
|
-
- -
|
26
|
+
- - ~>
|
27
27
|
- !ruby/object:Gem::Version
|
28
28
|
hash: 5
|
29
29
|
segments:
|
30
30
|
- 1
|
31
31
|
- 5
|
32
|
-
|
33
|
-
version: 1.5.3
|
32
|
+
version: "1.5"
|
34
33
|
type: :runtime
|
35
34
|
version_requirements: *id001
|
36
35
|
- !ruby/object:Gem::Dependency
|