metainspector 3.2.0 → 3.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +13 -2
- data/examples/basic_scraping.rb +24 -0
- data/examples/link_checker.rb +100 -0
- data/examples/spider.rb +39 -0
- data/lib/meta_inspector/document.rb +4 -2
- data/lib/meta_inspector/request.rb +2 -2
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/document_spec.rb +29 -7
- data/spec/fixtures/404.response +24 -0
- data/spec/request_spec.rb +17 -0
- data/spec/spec_helper.rb +3 -0
- metadata +6 -4
- data/samples/basic_scraping.rb +0 -22
- data/samples/spider.rb +0 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 72feb77b18ae296efcdd96b30950e2deaefb9689
|
4
|
+
data.tar.gz: b6442b9f256c6b31e18005af2f5955c339b37298
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a9b4002024e2466d5538a1e54c39fe336f2b049128249956b1951a23dfb3f1d09cdea7da63b9e602102fe936f8e14b884000a48a1a970ecd05ab58a632aecef5
|
7
|
+
data.tar.gz: 99d8ced9fcf7e03df6316cd5442a54320df1dca1e32ad3a9ee9421638f38f779860994bece0496a048180df49bfbf67580df01ceb4c674d2e66c6df90e0c77f5
|
data/README.md
CHANGED
@@ -45,9 +45,18 @@ You can also include the html which will be used as the document to scrape:
|
|
45
45
|
|
46
46
|
page = MetaInspector.new("http://sitevalidator.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
|
47
47
|
|
48
|
+
## Accessing response status and headers
|
49
|
+
|
50
|
+
You can check the status and headers from the response like this:
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
page.response.status # 200
|
54
|
+
page.response.headers # { "server"=>"nginx", "content-type"=>"text/html; charset=utf-8", "cache-control"=>"must-revalidate, private, max-age=0", ... }
|
55
|
+
```
|
56
|
+
|
48
57
|
## Accessing scraped data
|
49
58
|
|
50
|
-
|
59
|
+
You can see the scraped data like this:
|
51
60
|
|
52
61
|
page.url # URL of the page
|
53
62
|
page.scheme # Scheme of the page (http, https)
|
@@ -256,7 +265,7 @@ You should avoid using the `:store` option, or use it wisely, as silencing error
|
|
256
265
|
|
257
266
|
## Examples
|
258
267
|
|
259
|
-
You can find some sample scripts on the
|
268
|
+
You can find some sample scripts in the `examples` folder, including a basic scraping script and a spider that follows internal links using a queue. What follows is an example of use from irb:
|
260
269
|
|
261
270
|
$ irb
|
262
271
|
>> require 'metainspector'
|
@@ -288,6 +297,8 @@ Thanks to all the contributors:
|
|
288
297
|
|
289
298
|
[https://github.com/jaimeiniesta/metainspector/graphs/contributors](https://github.com/jaimeiniesta/metainspector/graphs/contributors)
|
290
299
|
|
300
|
+
You are more than welcome to come chat with us on our [Gitter room](https://gitter.im/jaimeiniesta/metainspector) and [Google group](https://groups.google.com/forum/#!forum/metainspector).
|
301
|
+
|
291
302
|
## Related projects
|
292
303
|
|
293
304
|
* [go-metainspector](https://github.com/fern4lvarez/go-metainspector), a port of MetaInspector for Go.
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# A basic MetaInspector example for scraping a page
|
2
|
+
#
|
3
|
+
# Usage example:
|
4
|
+
#
|
5
|
+
# ruby basic_scraping.rb jaimeiniesta.com
|
6
|
+
|
7
|
+
require 'metainspector'
|
8
|
+
|
9
|
+
# Get the starting URL
|
10
|
+
url = ARGV[0] || (puts "Enter an url"; gets.strip)
|
11
|
+
|
12
|
+
page = MetaInspector.new(url)
|
13
|
+
|
14
|
+
puts "Scraping #{page.url} returned these results:"
|
15
|
+
puts "TITLE: #{page.title}"
|
16
|
+
puts "META DESCRIPTION: #{page.meta['description']}"
|
17
|
+
puts "META KEYWORDS: #{page.meta['keywords']}"
|
18
|
+
puts "#{page.links.size} links found..."
|
19
|
+
page.links.each do |link|
|
20
|
+
puts " ==> #{link}"
|
21
|
+
end
|
22
|
+
|
23
|
+
puts "to_hash..."
|
24
|
+
puts page.to_hash
|
@@ -0,0 +1,100 @@
|
|
1
|
+
# A basic spider that will follow internal links, checking broken links
|
2
|
+
#
|
3
|
+
# Usage example:
|
4
|
+
#
|
5
|
+
# ruby link_checker.rb alazan.com
|
6
|
+
|
7
|
+
require 'metainspector'
|
8
|
+
|
9
|
+
class BrokenLinkChecker
|
10
|
+
attr_reader :broken
|
11
|
+
|
12
|
+
def initialize(url)
|
13
|
+
@url = url
|
14
|
+
@queue = []
|
15
|
+
@visited = []
|
16
|
+
@ok = []
|
17
|
+
@broken = {}
|
18
|
+
|
19
|
+
check
|
20
|
+
end
|
21
|
+
|
22
|
+
def report
|
23
|
+
puts "\n#{@broken.size} broken links found."
|
24
|
+
|
25
|
+
@broken.each do |link, from|
|
26
|
+
puts "\n#{link} linked from"
|
27
|
+
from.each do |origin|
|
28
|
+
puts " - #{origin}"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def check
|
36
|
+
# Resolve initial redirections
|
37
|
+
page = MetaInspector.new(@url)
|
38
|
+
|
39
|
+
# Push this initial URL to the queue
|
40
|
+
@queue.push(page.url)
|
41
|
+
|
42
|
+
while @queue.any?
|
43
|
+
url = @queue.pop
|
44
|
+
|
45
|
+
page = MetaInspector.new(url, :warn_level => :store)
|
46
|
+
|
47
|
+
if page.ok?
|
48
|
+
# Gets all HTTP links
|
49
|
+
page.links.select {|l| l =~ /^http(s)?:\/\//i}.each do |link|
|
50
|
+
check_status(link, page.url)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
@visited.push(page.url)
|
55
|
+
|
56
|
+
page.internal_links.each do |link|
|
57
|
+
@queue.push(link) unless @visited.include?(link) || @broken.include?(link) || @queue.include?(link)
|
58
|
+
end
|
59
|
+
|
60
|
+
puts "#{'%3s' % @visited.size} pages visited, #{'%3s' % @queue.size} pages on queue, #{'%2s' % @broken.size} broken links"
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
# Checks the response status of the linked_url and stores it on the ok or broken collections
|
65
|
+
def check_status(linked_url, from_url)
|
66
|
+
if @broken.keys.include?(linked_url)
|
67
|
+
# This was already known to be broken, we add another origin
|
68
|
+
@broken[linked_url] << from_url
|
69
|
+
else
|
70
|
+
if !@ok.include?(linked_url)
|
71
|
+
# We still don't know about this link status, so we check it now
|
72
|
+
if reachable?(linked_url)
|
73
|
+
@ok << linked_url
|
74
|
+
else
|
75
|
+
@broken[linked_url] = [from_url]
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
# A page is reachable if its response status is less than 400
|
82
|
+
# In the case of exceptions, like timeouts or server connection errors,
|
83
|
+
# we consider it unreachable
|
84
|
+
def reachable?(url)
|
85
|
+
page = MetaInspector.new(url)
|
86
|
+
|
87
|
+
if page.response.status < 400
|
88
|
+
true
|
89
|
+
else
|
90
|
+
false
|
91
|
+
end
|
92
|
+
rescue Exception => e
|
93
|
+
false
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
# Get the starting URL
|
98
|
+
url = ARGV[0] || (puts "Enter a starting url"; gets.strip)
|
99
|
+
|
100
|
+
BrokenLinkChecker.new(url).report
|
data/examples/spider.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# A basic spider that will follow internal links
|
2
|
+
#
|
3
|
+
# Usage example:
|
4
|
+
#
|
5
|
+
# ruby spider.rb jaimeiniesta.com
|
6
|
+
|
7
|
+
require 'metainspector'
|
8
|
+
|
9
|
+
# Two arrays, one for the scraping queue and one for the visited links
|
10
|
+
queue = []
|
11
|
+
visited = []
|
12
|
+
|
13
|
+
# Get the starting URL
|
14
|
+
url = ARGV[0] || (puts "Enter a starting url"; gets.strip)
|
15
|
+
|
16
|
+
# Resolve initial redirections
|
17
|
+
page = MetaInspector.new(url)
|
18
|
+
|
19
|
+
# Push this initial URL to the queue
|
20
|
+
queue.push(page.url)
|
21
|
+
|
22
|
+
while queue.any?
|
23
|
+
url = queue.pop
|
24
|
+
|
25
|
+
visited.push(url)
|
26
|
+
|
27
|
+
puts "VISITED: #{url}"
|
28
|
+
|
29
|
+
page = MetaInspector.new(url)
|
30
|
+
|
31
|
+
page.internal_links.each do |link|
|
32
|
+
queue.push(link) unless visited.include?(link) || queue.include?(link)
|
33
|
+
end
|
34
|
+
|
35
|
+
puts "#{visited.size} pages visited, #{queue.size} pages on queue\n\n"
|
36
|
+
end
|
37
|
+
|
38
|
+
puts "\nScraping finished, these are the internal links found:\n\n"
|
39
|
+
puts visited.sort
|
@@ -34,7 +34,7 @@ module MetaInspector
|
|
34
34
|
|
35
35
|
extend Forwardable
|
36
36
|
def_delegators :@url, :url, :scheme, :host, :root_url
|
37
|
-
def_delegators :@request, :content_type
|
37
|
+
def_delegators :@request, :content_type, :response
|
38
38
|
def_delegators :@parser, :parsed, :respond_to?, :title, :description, :links, :internal_links, :external_links,
|
39
39
|
:images, :image, :feed, :charset, :meta_tags, :meta_tag, :meta, :favicon
|
40
40
|
|
@@ -51,7 +51,9 @@ module MetaInspector
|
|
51
51
|
'feed' => feed,
|
52
52
|
'content_type' => content_type,
|
53
53
|
'meta_tags' => meta_tags,
|
54
|
-
'favicon' => favicon
|
54
|
+
'favicon' => favicon,
|
55
|
+
'response' => { 'status' => response.status,
|
56
|
+
'headers' => response.headers }
|
55
57
|
}
|
56
58
|
end
|
57
59
|
|
@@ -34,8 +34,6 @@ module MetaInspector
|
|
34
34
|
response.headers["content-type"].split(";")[0] if response
|
35
35
|
end
|
36
36
|
|
37
|
-
private
|
38
|
-
|
39
37
|
def response
|
40
38
|
request_count ||= 0
|
41
39
|
request_count += 1
|
@@ -48,6 +46,8 @@ module MetaInspector
|
|
48
46
|
nil
|
49
47
|
end
|
50
48
|
|
49
|
+
private
|
50
|
+
|
51
51
|
def fetch
|
52
52
|
session = Faraday.new(:url => url) do |faraday|
|
53
53
|
if @allow_redirections
|
data/spec/document_spec.rb
CHANGED
@@ -24,9 +24,9 @@ describe MetaInspector::Document do
|
|
24
24
|
it "should return a Hash with all the values set" do
|
25
25
|
@m = MetaInspector::Document.new('http://pagerankalert.com')
|
26
26
|
@m.to_hash.should == {
|
27
|
-
"url" =>"http://pagerankalert.com/",
|
28
|
-
"title" =>"PageRankAlert.com :: Track your PageRank changes & receive alerts",
|
29
|
-
"favicon" =>"http://pagerankalert.com/src/favicon.ico",
|
27
|
+
"url" => "http://pagerankalert.com/",
|
28
|
+
"title" => "PageRankAlert.com :: Track your PageRank changes & receive alerts",
|
29
|
+
"favicon" => "http://pagerankalert.com/src/favicon.ico",
|
30
30
|
"links" => ["http://pagerankalert.com/",
|
31
31
|
"http://pagerankalert.com/es?language=es",
|
32
32
|
"http://pagerankalert.com/users/sign_up",
|
@@ -46,14 +46,36 @@ describe MetaInspector::Document do
|
|
46
46
|
"images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
|
47
47
|
"charset" => "utf-8",
|
48
48
|
"feed" => "http://feeds.feedburner.com/PageRankAlert",
|
49
|
-
"content_type" =>"text/html",
|
50
|
-
"meta_tags" => {
|
49
|
+
"content_type" => "text/html",
|
50
|
+
"meta_tags" => {
|
51
|
+
"name" => {
|
52
|
+
"description" => ["Track your PageRank(TM) changes and receive alerts by email"],
|
51
53
|
"keywords" => ["pagerank, seo, optimization, google"], "robots"=>["all,follow"],
|
52
54
|
"csrf-param" => ["authenticity_token"],
|
53
|
-
"csrf-token" => ["iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="]
|
55
|
+
"csrf-token" => ["iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="]
|
56
|
+
},
|
54
57
|
"http-equiv" => {},
|
55
58
|
"property" => {},
|
56
|
-
"charset" => ["utf-8"]
|
59
|
+
"charset" => ["utf-8"]
|
60
|
+
},
|
61
|
+
"response" => {
|
62
|
+
"status" => 200,
|
63
|
+
"headers" => {
|
64
|
+
"server" => "nginx/0.7.67",
|
65
|
+
"date"=>"Mon, 30 May 2011 09:45:42 GMT",
|
66
|
+
"content-type" => "text/html; charset=utf-8",
|
67
|
+
"connection" => "keep-alive",
|
68
|
+
"etag" => "\"d0534cf7ad7d7a7fb737fe4ad99b0fd1\"",
|
69
|
+
"x-ua-compatible" => "IE=Edge,chrome=1",
|
70
|
+
"x-runtime" => "0.031274",
|
71
|
+
"set-cookie" => "_session_id=33575f7694b4492af4c4e282d62a7127; path=/; HttpOnly",
|
72
|
+
"cache-control" => "max-age=0, private, must-revalidate",
|
73
|
+
"content-length" => "6690",
|
74
|
+
"x-varnish" => "2167295052",
|
75
|
+
"age" => "0",
|
76
|
+
"via" => "1.1 varnish"
|
77
|
+
}
|
78
|
+
}
|
57
79
|
}
|
58
80
|
end
|
59
81
|
|
@@ -0,0 +1,24 @@
|
|
1
|
+
HTTP/1.1 404 Not Found
|
2
|
+
Server: nginx
|
3
|
+
Date: Fri, 17 Oct 2014 21:01:44 GMT
|
4
|
+
Content-Type: text/html; charset=utf-8
|
5
|
+
Content-Length: 933
|
6
|
+
Connection: keep-alive
|
7
|
+
Status: 404 Not Found
|
8
|
+
X-Request-Id: 84997334b729a4e1ad65c10d9c1f68a7
|
9
|
+
X-Runtime: 0.032396
|
10
|
+
X-Rack-Cache: miss
|
11
|
+
|
12
|
+
<!DOCTYPE html>
|
13
|
+
<html>
|
14
|
+
|
15
|
+
<head>
|
16
|
+
<meta charset="UTF-8" />
|
17
|
+
<title>The page you were looking for doesn't exist (404)</title>
|
18
|
+
</head>
|
19
|
+
|
20
|
+
<body>
|
21
|
+
<h1>Four Oh Four!</h1>
|
22
|
+
<h2>The page you were looking for doesn't exist.</h2>
|
23
|
+
</body>
|
24
|
+
</html>
|
data/spec/request_spec.rb
CHANGED
@@ -12,6 +12,23 @@ describe MetaInspector::Request do
|
|
12
12
|
end
|
13
13
|
end
|
14
14
|
|
15
|
+
describe "response" do
|
16
|
+
it "contains the response status" do
|
17
|
+
page_request = MetaInspector::Request.new(url('http://example.com'))
|
18
|
+
page_request.response.status.should == 200
|
19
|
+
end
|
20
|
+
|
21
|
+
it "contains the response headers" do
|
22
|
+
page_request = MetaInspector::Request.new(url('http://example.com'))
|
23
|
+
page_request.response.headers
|
24
|
+
.should == {"server"=>"nginx/0.7.67", "date"=>"Fri, 18 Nov 2011 21:46:46 GMT",
|
25
|
+
"content-type"=>"text/html", "connection"=>"keep-alive",
|
26
|
+
"last-modified"=>"Mon, 14 Nov 2011 16:53:18 GMT",
|
27
|
+
"content-length"=>"4987", "x-varnish"=>"2000423390",
|
28
|
+
"age"=>"0", "via"=>"1.1 varnish"}
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
15
32
|
describe "content_type" do
|
16
33
|
it "should return the correct content type of the url for html pages" do
|
17
34
|
page_request = MetaInspector::Request.new(url('http://pagerankalert.com'))
|
data/spec/spec_helper.rb
CHANGED
@@ -30,6 +30,9 @@ end
|
|
30
30
|
# This is the base page to be used in the examples
|
31
31
|
FakeWeb.register_uri(:get, "http://example.com/", :response => fixture_file("example.response"))
|
32
32
|
|
33
|
+
# Used to test response status codes
|
34
|
+
FakeWeb.register_uri(:get, "http://example.com/404", :response => fixture_file("404.response"))
|
35
|
+
|
33
36
|
# These are older fixtures
|
34
37
|
FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
|
35
38
|
FakeWeb.register_uri(:get, "http://pagerankalert-shortcut.com", :response => fixture_file("pagerankalert-shortcut.com.response"))
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metainspector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.2.0
|
4
|
+
version: 3.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jaime Iniesta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-10-
|
11
|
+
date: 2014-10-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -179,6 +179,9 @@ files:
|
|
179
179
|
- MIT-LICENSE
|
180
180
|
- README.md
|
181
181
|
- Rakefile
|
182
|
+
- examples/basic_scraping.rb
|
183
|
+
- examples/link_checker.rb
|
184
|
+
- examples/spider.rb
|
182
185
|
- lib/meta_inspector.rb
|
183
186
|
- lib/meta_inspector/document.rb
|
184
187
|
- lib/meta_inspector/exception_log.rb
|
@@ -189,10 +192,9 @@ files:
|
|
189
192
|
- lib/meta_inspector/version.rb
|
190
193
|
- lib/metainspector.rb
|
191
194
|
- meta_inspector.gemspec
|
192
|
-
- samples/basic_scraping.rb
|
193
|
-
- samples/spider.rb
|
194
195
|
- spec/document_spec.rb
|
195
196
|
- spec/exception_log_spec.rb
|
197
|
+
- spec/fixtures/404.response
|
196
198
|
- spec/fixtures/alazan.com.response
|
197
199
|
- spec/fixtures/alazan_websolution.response
|
198
200
|
- spec/fixtures/charset_000.response
|
data/samples/basic_scraping.rb
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
# Some basic MetaInspector samples
|
2
|
-
|
3
|
-
$: << File.join(File.dirname(__FILE__), "/../lib")
|
4
|
-
require 'meta_inspector'
|
5
|
-
require 'ap'
|
6
|
-
|
7
|
-
puts "Enter a valid http url to scrape it"
|
8
|
-
url = gets.strip
|
9
|
-
page = MetaInspector.new(url)
|
10
|
-
puts "...please wait while scraping the page..."
|
11
|
-
|
12
|
-
puts "Scraping #{page.url} returned these results:"
|
13
|
-
puts "TITLE: #{page.title}"
|
14
|
-
puts "META DESCRIPTION: #{page.meta_description}"
|
15
|
-
puts "META KEYWORDS: #{page.meta_keywords}"
|
16
|
-
puts "#{page.links.size} links found..."
|
17
|
-
page.links.each do |link|
|
18
|
-
puts " ==> #{link}"
|
19
|
-
end
|
20
|
-
|
21
|
-
puts "to_hash..."
|
22
|
-
ap page.to_hash
|
data/samples/spider.rb
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
# A basic spider that will follow links on an infinite loop
|
2
|
-
$: << File.join(File.dirname(__FILE__), "/../lib")
|
3
|
-
require 'rubygems'
|
4
|
-
require 'meta_inspector'
|
5
|
-
|
6
|
-
q = Queue.new
|
7
|
-
visited_links=[]
|
8
|
-
|
9
|
-
puts "Enter a valid http url to spider it following internal links"
|
10
|
-
url = gets.strip
|
11
|
-
|
12
|
-
page = MetaInspector.new(url)
|
13
|
-
q.push(url)
|
14
|
-
|
15
|
-
while q.size > 0
|
16
|
-
visited_links << url = q.pop
|
17
|
-
page = MetaInspector.new(url)
|
18
|
-
puts "Spidering #{page.url}"
|
19
|
-
|
20
|
-
puts "TITLE: #{page.title}"
|
21
|
-
puts "META DESCRIPTION: #{page.meta_description}"
|
22
|
-
puts "META KEYWORDS: #{page.meta_keywords}"
|
23
|
-
puts "LINKS: #{page.internal_links.size}"
|
24
|
-
page.internal_links.each do |link|
|
25
|
-
if !visited_links.include?(link)
|
26
|
-
q.push(link)
|
27
|
-
end
|
28
|
-
end
|
29
|
-
puts "#{visited_links.size} pages visited, #{q.size} pages on queue\n\n"
|
30
|
-
end
|