metainspector 3.2.0 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +13 -2
- data/examples/basic_scraping.rb +24 -0
- data/examples/link_checker.rb +100 -0
- data/examples/spider.rb +39 -0
- data/lib/meta_inspector/document.rb +4 -2
- data/lib/meta_inspector/request.rb +2 -2
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/document_spec.rb +29 -7
- data/spec/fixtures/404.response +24 -0
- data/spec/request_spec.rb +17 -0
- data/spec/spec_helper.rb +3 -0
- metadata +6 -4
- data/samples/basic_scraping.rb +0 -22
- data/samples/spider.rb +0 -30
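The headline change in 3.3.0 is that the HTTP response is now part of the public API, as the diffs below show. A minimal sketch of the new surface (method names are taken from the diffs below; the URL and return values are illustrative):

```ruby
require 'metainspector'

page = MetaInspector.new("http://example.com")

# Newly public in 3.3.0: the response, delegated from the request object
page.response.status   # e.g. 200
page.response.headers  # e.g. { "server" => "nginx", "content-type" => "text/html; charset=utf-8", ... }
```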
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 72feb77b18ae296efcdd96b30950e2deaefb9689
+  data.tar.gz: b6442b9f256c6b31e18005af2f5955c339b37298
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a9b4002024e2466d5538a1e54c39fe336f2b049128249956b1951a23dfb3f1d09cdea7da63b9e602102fe936f8e14b884000a48a1a970ecd05ab58a632aecef5
+  data.tar.gz: 99d8ced9fcf7e03df6316cd5442a54320df1dca1e32ad3a9ee9421638f38f779860994bece0496a048180df49bfbf67580df01ceb4c674d2e66c6df90e0c77f5
data/README.md
CHANGED
@@ -45,9 +45,18 @@ You can also include the html which will be used as the document to scrape:
 
 page = MetaInspector.new("http://sitevalidator.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
 
+## Accessing response status and headers
+
+You can check the status and headers from the response like this:
+
+```ruby
+page.response.status # 200
+page.response.headers # { "server"=>"nginx", "content-type"=>"text/html; charset=utf-8", "cache-control"=>"must-revalidate, private, max-age=0", ... }
+```
+
 ## Accessing scraped data
 
-
+You can see the scraped data like this:
 
 page.url # URL of the page
 page.scheme # Scheme of the page (http, https)
@@ -256,7 +265,7 @@ You should avoid using the `:store` option, or use it wisely, as silencing errors
 
 ## Examples
 
-You can find some sample scripts on the
+You can find some sample scripts on the `examples` folder, including a basic scraping and a spider that will follow external links using a queue. What follows is an example of use from irb:
 
 $ irb
 >> require 'metainspector'
@@ -288,6 +297,8 @@ Thanks to all the contributors:
 
 [https://github.com/jaimeiniesta/metainspector/graphs/contributors](https://github.com/jaimeiniesta/metainspector/graphs/contributors)
 
+You are more than welcome to come chat with us on our [Gitter room](https://gitter.im/jaimeiniesta/metainspector) and [Google group](https://groups.google.com/forum/#!forum/metainspector).
+
 ## Related projects
 
 * [go-metainspector](https://github.com/fern4lvarez/go-metainspector), a port of MetaInspector for Go.
data/examples/basic_scraping.rb
ADDED
@@ -0,0 +1,24 @@
+# A basic MetaInspector example for scraping a page
+#
+# Usage example:
+#
+# ruby basic_scraping.rb jaimeiniesta.com
+
+require 'metainspector'
+
+# Get the starting URL
+url = ARGV[0] || (puts "Enter an url"; gets.strip)
+
+page = MetaInspector.new(url)
+
+puts "Scraping #{page.url} returned these results:"
+puts "TITLE: #{page.title}"
+puts "META DESCRIPTION: #{page.meta['description']}"
+puts "META KEYWORDS: #{page.meta['keywords']}"
+puts "#{page.links.size} links found..."
+page.links.each do |link|
+  puts " ==> #{link}"
+end
+
+puts "to_hash..."
+puts page.to_hash
data/examples/link_checker.rb
ADDED
@@ -0,0 +1,100 @@
+# A basic spider that will follow internal links, checking broken links
+#
+# Usage example:
+#
+# ruby link_checker.rb alazan.com
+
+require 'metainspector'
+
+class BrokenLinkChecker
+  attr_reader :broken
+
+  def initialize(url)
+    @url     = url
+    @queue   = []
+    @visited = []
+    @ok      = []
+    @broken  = {}
+
+    check
+  end
+
+  def report
+    puts "\n#{@broken.size} broken links found."
+
+    @broken.each do |link, from|
+      puts "\n#{link} linked from"
+      from.each do |origin|
+        puts " - #{origin}"
+      end
+    end
+  end
+
+  private
+
+  def check
+    # Resolve initial redirections
+    page = MetaInspector.new(@url)
+
+    # Push this initial URL to the queue
+    @queue.push(page.url)
+
+    while @queue.any?
+      url = @queue.pop
+
+      page = MetaInspector.new(url, :warn_level => :store)
+
+      if page.ok?
+        # Gets all HTTP links
+        page.links.select {|l| l =~ /^http(s)?:\/\//i}.each do |link|
+          check_status(link, page.url)
+        end
+      end
+
+      @visited.push(page.url)
+
+      page.internal_links.each do |link|
+        @queue.push(link) unless @visited.include?(link) || @broken.include?(link) || @queue.include?(link)
+      end
+
+      puts "#{'%3s' % @visited.size} pages visited, #{'%3s' % @queue.size} pages on queue, #{'%2s' % @broken.size} broken links"
+    end
+  end
+
+  # Checks the response status of the linked_url and stores it on the ok or broken collections
+  def check_status(linked_url, from_url)
+    if @broken.keys.include?(linked_url)
+      # This was already known to be broken, we add another origin
+      @broken[linked_url] << from_url
+    else
+      if !@ok.include?(linked_url)
+        # We still don't know about this link status, so we check it now
+        if reachable?(linked_url)
+          @ok << linked_url
+        else
+          @broken[linked_url] = [from_url]
+        end
+      end
+    end
+  end
+
+  # A page is reachable if its response status is less than 400
+  # In the case of exceptions, like timeouts or server connection errors,
+  # we consider it unreachable
+  def reachable?(url)
+    page = MetaInspector.new(url)
+
+    if page.response.status < 400
+      true
+    else
+      false
+    end
+  rescue Exception => e
+    false
+  end
+end
+
+# Get the starting URL
+url = ARGV[0] || (puts "Enter a starting url"; gets.strip)
+
+BrokenLinkChecker.new(url).report
data/examples/spider.rb
ADDED
@@ -0,0 +1,39 @@
+# A basic spider that will follow internal links
+#
+# Usage example:
+#
+# ruby spider.rb jaimeiniesta.com
+
+require 'metainspector'
+
+# Two arrays, one for the scraping queue and one for the visited links
+queue   = []
+visited = []
+
+# Get the starting URL
+url = ARGV[0] || (puts "Enter a starting url"; gets.strip)
+
+# Resolve initial redirections
+page = MetaInspector.new(url)
+
+# Push this initial URL to the queue
+queue.push(page.url)
+
+while queue.any?
+  url = queue.pop
+
+  visited.push(url)
+
+  puts "VISITED: #{url}"
+
+  page = MetaInspector.new(url)
+
+  page.internal_links.each do |link|
+    queue.push(link) unless visited.include?(link) || queue.include?(link)
+  end
+
+  puts "#{visited.size} pages visited, #{queue.size} pages on queue\n\n"
+end
+
+puts "\nScraping finished, these are the internal links found:\n\n"
+puts visited.sort
data/lib/meta_inspector/document.rb
CHANGED
@@ -34,7 +34,7 @@ module MetaInspector
 
     extend Forwardable
     def_delegators :@url, :url, :scheme, :host, :root_url
-    def_delegators :@request, :content_type
+    def_delegators :@request, :content_type, :response
     def_delegators :@parser, :parsed, :respond_to?, :title, :description, :links, :internal_links, :external_links,
                    :images, :image, :feed, :charset, :meta_tags, :meta_tag, :meta, :favicon
 
@@ -51,7 +51,9 @@ module MetaInspector
       'feed' => feed,
       'content_type' => content_type,
       'meta_tags' => meta_tags,
-      'favicon' => favicon
+      'favicon' => favicon,
+      'response' => { 'status' => response.status,
+                      'headers' => response.headers }
     }
   end
 
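The document now delegates `response` to the request and surfaces a `response` entry in `to_hash`. A minimal sketch of the new entry (illustrative URL and values, assuming a successful request):

```ruby
page = MetaInspector.new("http://example.com")

# The new 'response' entry nests status and headers, per the diff above
page.to_hash['response']  # => { 'status' => 200, 'headers' => { "server" => "nginx", ... } }
```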
data/lib/meta_inspector/request.rb
CHANGED
@@ -34,8 +34,6 @@ module MetaInspector
       response.headers["content-type"].split(";")[0] if response
     end
 
-    private
-
     def response
       request_count ||= 0
       request_count += 1
@@ -48,6 +46,8 @@ module MetaInspector
       nil
     end
 
+    private
+
     def fetch
       session = Faraday.new(:url => url) do |faraday|
         if @allow_redirections
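This change simply moves `response` above the `private` keyword, making the response object (built by the Faraday session in `fetch`) publicly readable, which is what lets the document delegate to it. A sketch only; you would normally reach it through the document:

```ruby
page = MetaInspector.new("http://example.com")

# Previously response was private to the request and not delegated,
# so this raised NoMethodError; as of 3.3.0 it works.
page.response.status
```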
data/spec/document_spec.rb
CHANGED
@@ -24,9 +24,9 @@ describe MetaInspector::Document do
   it "should return a Hash with all the values set" do
     @m = MetaInspector::Document.new('http://pagerankalert.com')
     @m.to_hash.should == {
-      "url" =>"http://pagerankalert.com/",
-      "title" =>"PageRankAlert.com :: Track your PageRank changes & receive alerts",
-      "favicon" =>"http://pagerankalert.com/src/favicon.ico",
+      "url" => "http://pagerankalert.com/",
+      "title" => "PageRankAlert.com :: Track your PageRank changes & receive alerts",
+      "favicon" => "http://pagerankalert.com/src/favicon.ico",
       "links" => ["http://pagerankalert.com/",
         "http://pagerankalert.com/es?language=es",
         "http://pagerankalert.com/users/sign_up",
@@ -46,14 +46,36 @@ describe MetaInspector::Document do
       "images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
       "charset" => "utf-8",
       "feed" => "http://feeds.feedburner.com/PageRankAlert",
-      "content_type" =>"text/html",
-      "meta_tags" => {
+      "content_type" => "text/html",
+      "meta_tags" => {
+        "name" => {
+          "description" => ["Track your PageRank(TM) changes and receive alerts by email"],
           "keywords" => ["pagerank, seo, optimization, google"], "robots"=>["all,follow"],
           "csrf-param" => ["authenticity_token"],
-      "csrf-token" => ["iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="]
+          "csrf-token" => ["iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="]
+        },
         "http-equiv" => {},
         "property" => {},
-      "charset" => ["utf-8"]
+        "charset" => ["utf-8"]
+      },
+      "response" => {
+        "status" => 200,
+        "headers" => {
+          "server" => "nginx/0.7.67",
+          "date"=>"Mon, 30 May 2011 09:45:42 GMT",
+          "content-type" => "text/html; charset=utf-8",
+          "connection" => "keep-alive",
+          "etag" => "\"d0534cf7ad7d7a7fb737fe4ad99b0fd1\"",
+          "x-ua-compatible" => "IE=Edge,chrome=1",
+          "x-runtime" => "0.031274",
+          "set-cookie" => "_session_id=33575f7694b4492af4c4e282d62a7127; path=/; HttpOnly",
+          "cache-control" => "max-age=0, private, must-revalidate",
+          "content-length" => "6690",
+          "x-varnish" => "2167295052",
+          "age" => "0",
+          "via" => "1.1 varnish"
+        }
+      }
     }
   end
 
data/spec/fixtures/404.response
ADDED
@@ -0,0 +1,24 @@
+HTTP/1.1 404 Not Found
+Server: nginx
+Date: Fri, 17 Oct 2014 21:01:44 GMT
+Content-Type: text/html; charset=utf-8
+Content-Length: 933
+Connection: keep-alive
+Status: 404 Not Found
+X-Request-Id: 84997334b729a4e1ad65c10d9c1f68a7
+X-Runtime: 0.032396
+X-Rack-Cache: miss
+
+<!DOCTYPE html>
+<html>
+
+<head>
+  <meta charset="UTF-8" />
+  <title>The page you were looking for doesn't exist (404)</title>
+</head>
+
+<body>
+  <h1>Four Oh Four!</h1>
+  <h2>The page you were looking for doesn't exist.</h2>
+</body>
+</html>
data/spec/request_spec.rb
CHANGED
@@ -12,6 +12,23 @@ describe MetaInspector::Request do
     end
   end
 
+  describe "response" do
+    it "contains the response status" do
+      page_request = MetaInspector::Request.new(url('http://example.com'))
+      page_request.response.status.should == 200
+    end
+
+    it "contains the response headers" do
+      page_request = MetaInspector::Request.new(url('http://example.com'))
+      page_request.response.headers
+        .should == {"server"=>"nginx/0.7.67", "date"=>"Fri, 18 Nov 2011 21:46:46 GMT",
+                    "content-type"=>"text/html", "connection"=>"keep-alive",
+                    "last-modified"=>"Mon, 14 Nov 2011 16:53:18 GMT",
+                    "content-length"=>"4987", "x-varnish"=>"2000423390",
+                    "age"=>"0", "via"=>"1.1 varnish"}
+    end
+  end
+
   describe "content_type" do
     it "should return the correct content type of the url for html pages" do
       page_request = MetaInspector::Request.new(url('http://pagerankalert.com'))
data/spec/spec_helper.rb
CHANGED
@@ -30,6 +30,9 @@ end
 # This is the base page to be used in the examples
 FakeWeb.register_uri(:get, "http://example.com/", :response => fixture_file("example.response"))
 
+# Used to test response status codes
+FakeWeb.register_uri(:get, "http://example.com/404", :response => fixture_file("404.response"))
+
 # These are older fixtures
 FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
 FakeWeb.register_uri(:get, "http://pagerankalert-shortcut.com", :response => fixture_file("pagerankalert-shortcut.com.response"))
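With this registration, specs can exercise a 404 without touching the network. A hypothetical spec in the style of request_spec.rb above (not part of this release; the `url` helper is the one the existing specs already use):

```ruby
it "returns the status of the 404 fixture" do
  page_request = MetaInspector::Request.new(url('http://example.com/404'))
  page_request.response.status.should == 404
end
```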
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: metainspector
 version: !ruby/object:Gem::Version
-  version: 3.
+  version: 3.3.0
 platform: ruby
 authors:
 - Jaime Iniesta
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-10-
+date: 2014-10-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -179,6 +179,9 @@ files:
 - MIT-LICENSE
 - README.md
 - Rakefile
+- examples/basic_scraping.rb
+- examples/link_checker.rb
+- examples/spider.rb
 - lib/meta_inspector.rb
 - lib/meta_inspector/document.rb
 - lib/meta_inspector/exception_log.rb
@@ -189,10 +192,9 @@ files:
 - lib/meta_inspector/version.rb
 - lib/metainspector.rb
 - meta_inspector.gemspec
-- samples/basic_scraping.rb
-- samples/spider.rb
 - spec/document_spec.rb
 - spec/exception_log_spec.rb
+- spec/fixtures/404.response
 - spec/fixtures/alazan.com.response
 - spec/fixtures/alazan_websolution.response
 - spec/fixtures/charset_000.response
data/samples/basic_scraping.rb
DELETED
@@ -1,22 +0,0 @@
-# Some basic MetaInspector samples
-
-$: << File.join(File.dirname(__FILE__), "/../lib")
-require 'meta_inspector'
-require 'ap'
-
-puts "Enter a valid http url to scrape it"
-url = gets.strip
-page = MetaInspector.new(url)
-puts "...please wait while scraping the page..."
-
-puts "Scraping #{page.url} returned these results:"
-puts "TITLE: #{page.title}"
-puts "META DESCRIPTION: #{page.meta_description}"
-puts "META KEYWORDS: #{page.meta_keywords}"
-puts "#{page.links.size} links found..."
-page.links.each do |link|
-  puts " ==> #{link}"
-end
-
-puts "to_hash..."
-ap page.to_hash
data/samples/spider.rb
DELETED
@@ -1,30 +0,0 @@
-# A basic spider that will follow links on an infinite loop
-$: << File.join(File.dirname(__FILE__), "/../lib")
-require 'rubygems'
-require 'meta_inspector'
-
-q = Queue.new
-visited_links=[]
-
-puts "Enter a valid http url to spider it following internal links"
-url = gets.strip
-
-page = MetaInspector.new(url)
-q.push(url)
-
-while q.size > 0
-  visited_links << url = q.pop
-  page = MetaInspector.new(url)
-  puts "Spidering #{page.url}"
-
-  puts "TITLE: #{page.title}"
-  puts "META DESCRIPTION: #{page.meta_description}"
-  puts "META KEYWORDS: #{page.meta_keywords}"
-  puts "LINKS: #{page.internal_links.size}"
-  page.internal_links.each do |link|
-    if !visited_links.include?(link)
-      q.push(link)
-    end
-  end
-  puts "#{visited_links.size} pages visited, #{q.size} pages on queue\n\n"
-end