metainspector 3.2.0 → 3.3.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 553be427edfb113d96f5956a55d184ecce6c2874
- data.tar.gz: 5c11f4b906d2fafa9bda9669521ac1cb1a958734
+ metadata.gz: 72feb77b18ae296efcdd96b30950e2deaefb9689
+ data.tar.gz: b6442b9f256c6b31e18005af2f5955c339b37298
  SHA512:
- metadata.gz: 0188e170dce0f4a4e94b5f91c280b6bc597dfe8b14eb3795d4bf0f7719a282ae04b1277c1b9bb02675f27891d2ddc23781e444bc78b47e95c61151a579bf92f3
- data.tar.gz: 51f3d50f0d1f575f5bafe3977bc9fbac3de740a7caf89a92b54a47e25408ba29946e9e09ea0a475462b7ded4651a237da72f8b8cd50e2672742a20caa9f692bb
+ metadata.gz: a9b4002024e2466d5538a1e54c39fe336f2b049128249956b1951a23dfb3f1d09cdea7da63b9e602102fe936f8e14b884000a48a1a970ecd05ab58a632aecef5
+ data.tar.gz: 99d8ced9fcf7e03df6316cd5442a54320df1dca1e32ad3a9ee9421638f38f779860994bece0496a048180df49bfbf67580df01ceb4c674d2e66c6df90e0c77f5
data/README.md CHANGED
@@ -45,9 +45,18 @@ You can also include the html which will be used as the document to scrape:

  page = MetaInspector.new("http://sitevalidator.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")

+ ## Accessing response status and headers
+
+ You can check the status and headers from the response like this:
+
+ ```ruby
+ page.response.status # 200
+ page.response.headers # { "server"=>"nginx", "content-type"=>"text/html; charset=utf-8", "cache-control"=>"must-revalidate, private, max-age=0", ... }
+ ```
+
  ## Accessing scraped data

- Then you can see the scraped data like this:
+ You can see the scraped data like this:

  page.url # URL of the page
  page.scheme # Scheme of the page (http, https)
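For a quick end-to-end picture of the new API documented above, here is a minimal sketch (the URL is illustrative; any reachable page works):

```ruby
require 'metainspector'

page = MetaInspector.new("http://sitevalidator.com")

# The response is exposed on the page object, so you can
# branch on the HTTP status before reading the scraped data
if page.response.status == 200
  puts page.response.headers["content-type"] # e.g. "text/html; charset=utf-8"
  puts page.title
end
```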
@@ -256,7 +265,7 @@ You should avoid using the `:store` option, or use it wisely, as silencing error

  ## Examples

- You can find some sample scripts on the samples folder, including a basic scraping and a spider that will follow external links using a queue. What follows is an example of use from irb:
+ You can find some sample scripts in the `examples` folder, including a basic scraper and a spider that will follow external links using a queue. What follows is an example of use from irb:

  $ irb
  >> require 'metainspector'
@@ -288,6 +297,8 @@ Thanks to all the contributors:

  [https://github.com/jaimeiniesta/metainspector/graphs/contributors](https://github.com/jaimeiniesta/metainspector/graphs/contributors)

+ You are more than welcome to come chat with us on our [Gitter room](https://gitter.im/jaimeiniesta/metainspector) and [Google group](https://groups.google.com/forum/#!forum/metainspector).
+
  ## Related projects

  * [go-metainspector](https://github.com/fern4lvarez/go-metainspector), a port of MetaInspector for Go.
data/examples/basic_scraping.rb ADDED
@@ -0,0 +1,24 @@
+ # A basic MetaInspector example for scraping a page
+ #
+ # Usage example:
+ #
+ #   ruby basic_scraping.rb jaimeiniesta.com
+
+ require 'metainspector'
+
+ # Get the starting URL
+ url = ARGV[0] || (puts "Enter a URL"; gets.strip)
+
+ page = MetaInspector.new(url)
+
+ puts "Scraping #{page.url} returned these results:"
+ puts "TITLE: #{page.title}"
+ puts "META DESCRIPTION: #{page.meta['description']}"
+ puts "META KEYWORDS: #{page.meta['keywords']}"
+ puts "#{page.links.size} links found..."
+ page.links.each do |link|
+   puts " ==> #{link}"
+ end
+
+ puts "to_hash..."
data/examples/link_checker.rb ADDED
@@ -0,0 +1,100 @@
+ # A basic spider that will follow internal links, checking broken links
+ #
+ # Usage example:
+ #
+ #   ruby link_checker.rb alazan.com
+
+ require 'metainspector'
+
+ class BrokenLinkChecker
+   attr_reader :broken
+
+   def initialize(url)
+     @url = url
+     @queue = []
+     @visited = []
+     @ok = []
+     @broken = {}
+
+     check
+   end
+
+   def report
+     puts "\n#{@broken.size} broken links found."
+
+     @broken.each do |link, from|
+       puts "\n#{link} linked from"
+       from.each do |origin|
+         puts " - #{origin}"
+       end
+     end
+   end
+
+   private
+
+   def check
+     # Resolve initial redirections
+     page = MetaInspector.new(@url)
+
+     # Push this initial URL to the queue
+     @queue.push(page.url)
+
+     while @queue.any?
+       url = @queue.pop
+
+       page = MetaInspector.new(url, :warn_level => :store)
+
+       if page.ok?
+         # Gets all HTTP links
+         page.links.select {|l| l =~ /^http(s)?:\/\//i}.each do |link|
+           check_status(link, page.url)
+         end
+       end
+
+       @visited.push(page.url)
+
+       page.internal_links.each do |link|
+         @queue.push(link) unless @visited.include?(link) || @broken.include?(link) || @queue.include?(link)
+       end
+
+       puts "#{'%3s' % @visited.size} pages visited, #{'%3s' % @queue.size} pages on queue, #{'%2s' % @broken.size} broken links"
+     end
+   end
+
+   # Checks the response status of the linked_url and stores it on the ok or broken collections
+   def check_status(linked_url, from_url)
+     if @broken.keys.include?(linked_url)
+       # This was already known to be broken, we add another origin
+       @broken[linked_url] << from_url
+     else
+       if !@ok.include?(linked_url)
+         # We still don't know about this link status, so we check it now
+         if reachable?(linked_url)
+           @ok << linked_url
+         else
+           @broken[linked_url] = [from_url]
+         end
+       end
+     end
+   end
+
+   # A page is reachable if its response status is less than 400
+   # In the case of exceptions, like timeouts or server connection errors,
+   # we consider it unreachable
+   def reachable?(url)
+     page = MetaInspector.new(url)
+
+     if page.response.status < 400
+       true
+     else
+       false
+     end
+   rescue Exception => e
+     false
+   end
+ end
+
+ # Get the starting URL
+ url = ARGV[0] || (puts "Enter a starting url"; gets.strip)
+
+ BrokenLinkChecker.new(url).report
data/examples/spider.rb ADDED
@@ -0,0 +1,39 @@
+ # A basic spider that will follow internal links
+ #
+ # Usage example:
+ #
+ #   ruby spider.rb jaimeiniesta.com
+
+ require 'metainspector'
+
+ # Two arrays, one for the scraping queue and one for the visited links
+ queue = []
+ visited = []
+
+ # Get the starting URL
+ url = ARGV[0] || (puts "Enter a starting url"; gets.strip)
+
+ # Resolve initial redirections
+ page = MetaInspector.new(url)
+
+ # Push this initial URL to the queue
+ queue.push(page.url)
+
+ while queue.any?
+   url = queue.pop
+
+   visited.push(url)
+
+   puts "VISITED: #{url}"
+
+   page = MetaInspector.new(url)
+
+   page.internal_links.each do |link|
+     queue.push(link) unless visited.include?(link) || queue.include?(link)
+   end
+
+   puts "#{visited.size} pages visited, #{queue.size} pages on queue\n\n"
+ end
+
+ puts "\nScraping finished, these are the internal links found:\n\n"
+ puts visited.sort
data/lib/meta_inspector/document.rb CHANGED
@@ -34,7 +34,7 @@ module MetaInspector

  extend Forwardable
  def_delegators :@url, :url, :scheme, :host, :root_url
- def_delegators :@request, :content_type
+ def_delegators :@request, :content_type, :response
  def_delegators :@parser, :parsed, :respond_to?, :title, :description, :links, :internal_links, :external_links,
                 :images, :image, :feed, :charset, :meta_tags, :meta_tag, :meta, :favicon
 
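This change makes `Document#response` forward to the underlying `Request` object via `Forwardable`. As a rough illustration (a hand-written equivalent, not the actual code), the delegation expands to something like:

```ruby
module MetaInspector
  class Document
    # Hand-written equivalent of:
    #   def_delegators :@request, :content_type, :response
    def content_type
      @request.content_type
    end

    def response
      @request.response
    end
  end
end
```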
@@ -51,7 +51,9 @@ module MetaInspector
  'feed' => feed,
  'content_type' => content_type,
  'meta_tags' => meta_tags,
- 'favicon' => favicon
+ 'favicon' => favicon,
+ 'response' => { 'status' => response.status,
+                 'headers' => response.headers }
  }
  end
 
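With `response` added here, the serialized hash now carries the HTTP status and headers as well. A quick illustrative use (the URL and values shown are examples, not fixed output):

```ruby
require 'metainspector'

page = MetaInspector.new("http://example.com")
hash = page.to_hash

hash['response']['status']  # e.g. 200
hash['response']['headers'] # e.g. { "server" => "nginx", "content-type" => "text/html; charset=utf-8", ... }
```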
data/lib/meta_inspector/request.rb CHANGED
@@ -34,8 +34,6 @@ module MetaInspector
  response.headers["content-type"].split(";")[0] if response
  end

- private
-
  def response
  request_count ||= 0
  request_count += 1
@@ -48,6 +46,8 @@ module MetaInspector
  nil
  end

+ private
+
  def fetch
  session = Faraday.new(:url => url) do |faraday|
  if @allow_redirections
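Moving `response` above `private` makes it part of the public API of `MetaInspector::Request`, so it can be called directly rather than only through `Document`. A minimal sketch, assuming the request is built the way the specs build it (wrapping the address in a `MetaInspector::URL` first):

```ruby
require 'metainspector'

# Assumption: MetaInspector::Request takes a MetaInspector::URL,
# as in spec/request_spec.rb
url     = MetaInspector::URL.new("http://example.com")
request = MetaInspector::Request.new(url)

request.response.status  # e.g. 200
request.response.headers # the raw response headers hash
request.content_type     # the media type split off the content-type header
```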
data/lib/meta_inspector/version.rb CHANGED
@@ -1,5 +1,5 @@
  # -*- encoding: utf-8 -*-

  module MetaInspector
- VERSION = "3.2.0"
+ VERSION = "3.3.0"
  end
data/spec/document_spec.rb CHANGED
@@ -24,9 +24,9 @@ describe MetaInspector::Document do
  it "should return a Hash with all the values set" do
  @m = MetaInspector::Document.new('http://pagerankalert.com')
  @m.to_hash.should == {
- "url" =>"http://pagerankalert.com/",
- "title" =>"PageRankAlert.com :: Track your PageRank changes & receive alerts",
- "favicon" =>"http://pagerankalert.com/src/favicon.ico",
+ "url" => "http://pagerankalert.com/",
+ "title" => "PageRankAlert.com :: Track your PageRank changes & receive alerts",
+ "favicon" => "http://pagerankalert.com/src/favicon.ico",
  "links" => ["http://pagerankalert.com/",
  "http://pagerankalert.com/es?language=es",
  "http://pagerankalert.com/users/sign_up",
@@ -46,14 +46,36 @@ describe MetaInspector::Document do
  "images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
  "charset" => "utf-8",
  "feed" => "http://feeds.feedburner.com/PageRankAlert",
- "content_type" =>"text/html",
- "meta_tags" => { "name" => { "description" => ["Track your PageRank(TM) changes and receive alerts by email"],
+ "content_type" => "text/html",
+ "meta_tags" => {
+   "name" => {
+     "description" => ["Track your PageRank(TM) changes and receive alerts by email"],
  "keywords" => ["pagerank, seo, optimization, google"], "robots"=>["all,follow"],
  "csrf-param" => ["authenticity_token"],
- "csrf-token" => ["iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="] },
+     "csrf-token" => ["iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="]
+   },
  "http-equiv" => {},
  "property" => {},
- "charset" => ["utf-8"] }
+   "charset" => ["utf-8"]
+ },
+ "response" => {
+   "status" => 200,
+   "headers" => {
+     "server" => "nginx/0.7.67",
+     "date"=>"Mon, 30 May 2011 09:45:42 GMT",
+     "content-type" => "text/html; charset=utf-8",
+     "connection" => "keep-alive",
+     "etag" => "\"d0534cf7ad7d7a7fb737fe4ad99b0fd1\"",
+     "x-ua-compatible" => "IE=Edge,chrome=1",
+     "x-runtime" => "0.031274",
+     "set-cookie" => "_session_id=33575f7694b4492af4c4e282d62a7127; path=/; HttpOnly",
+     "cache-control" => "max-age=0, private, must-revalidate",
+     "content-length" => "6690",
+     "x-varnish" => "2167295052",
+     "age" => "0",
+     "via" => "1.1 varnish"
+   }
+ }
  }
  end
 
data/spec/fixtures/404.response ADDED
@@ -0,0 +1,24 @@
+ HTTP/1.1 404 Not Found
+ Server: nginx
+ Date: Fri, 17 Oct 2014 21:01:44 GMT
+ Content-Type: text/html; charset=utf-8
+ Content-Length: 933
+ Connection: keep-alive
+ Status: 404 Not Found
+ X-Request-Id: 84997334b729a4e1ad65c10d9c1f68a7
+ X-Runtime: 0.032396
+ X-Rack-Cache: miss
+
+ <!DOCTYPE html>
+ <html>
+
+ <head>
+   <meta charset="UTF-8" />
+   <title>The page you were looking for doesn't exist (404)</title>
+ </head>
+
+ <body>
+   <h1>Four Oh Four!</h1>
+   <h2>The page you were looking for doesn't exist.</h2>
+ </body>
+ </html>
data/spec/request_spec.rb CHANGED
@@ -12,6 +12,23 @@ describe MetaInspector::Request do
  end
  end

+ describe "response" do
+   it "contains the response status" do
+     page_request = MetaInspector::Request.new(url('http://example.com'))
+     page_request.response.status.should == 200
+   end
+
+   it "contains the response headers" do
+     page_request = MetaInspector::Request.new(url('http://example.com'))
+     page_request.response.headers
+       .should == {"server"=>"nginx/0.7.67", "date"=>"Fri, 18 Nov 2011 21:46:46 GMT",
+                   "content-type"=>"text/html", "connection"=>"keep-alive",
+                   "last-modified"=>"Mon, 14 Nov 2011 16:53:18 GMT",
+                   "content-length"=>"4987", "x-varnish"=>"2000423390",
+                   "age"=>"0", "via"=>"1.1 varnish"}
+   end
+ end
+
  describe "content_type" do
  it "should return the correct content type of the url for html pages" do
  page_request = MetaInspector::Request.new(url('http://pagerankalert.com'))
data/spec/spec_helper.rb CHANGED
@@ -30,6 +30,9 @@ end
  # This is the base page to be used in the examples
  FakeWeb.register_uri(:get, "http://example.com/", :response => fixture_file("example.response"))

+ # Used to test response status codes
+ FakeWeb.register_uri(:get, "http://example.com/404", :response => fixture_file("404.response"))
+
  # These are older fixtures
  FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
  FakeWeb.register_uri(:get, "http://pagerankalert-shortcut.com", :response => fixture_file("pagerankalert-shortcut.com.response"))
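With that 404 fixture registered, a spec along these lines (hypothetical, not part of this release) could exercise the error path, reusing the `url` helper from `request_spec.rb`:

```ruby
describe "response status for missing pages" do
  it "exposes the 404 status" do
    page_request = MetaInspector::Request.new(url('http://example.com/404'))
    page_request.response.status.should == 404
  end
end
```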
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: metainspector
  version: !ruby/object:Gem::Version
- version: 3.2.0
+ version: 3.3.0
  platform: ruby
  authors:
  - Jaime Iniesta
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-10-14 00:00:00.000000000 Z
+ date: 2014-10-24 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri
@@ -179,6 +179,9 @@ files:
  - MIT-LICENSE
  - README.md
  - Rakefile
+ - examples/basic_scraping.rb
+ - examples/link_checker.rb
+ - examples/spider.rb
  - lib/meta_inspector.rb
  - lib/meta_inspector/document.rb
  - lib/meta_inspector/exception_log.rb
@@ -189,10 +192,9 @@ files:
  - lib/meta_inspector/version.rb
  - lib/metainspector.rb
  - meta_inspector.gemspec
- - samples/basic_scraping.rb
- - samples/spider.rb
  - spec/document_spec.rb
  - spec/exception_log_spec.rb
+ - spec/fixtures/404.response
  - spec/fixtures/alazan.com.response
  - spec/fixtures/alazan_websolution.response
  - spec/fixtures/charset_000.response
data/samples/basic_scraping.rb DELETED
@@ -1,22 +0,0 @@
- # Some basic MetaInspector samples
-
- $: << File.join(File.dirname(__FILE__), "/../lib")
- require 'meta_inspector'
- require 'ap'
-
- puts "Enter a valid http url to scrape it"
- url = gets.strip
- page = MetaInspector.new(url)
- puts "...please wait while scraping the page..."
-
- puts "Scraping #{page.url} returned these results:"
- puts "TITLE: #{page.title}"
- puts "META DESCRIPTION: #{page.meta_description}"
- puts "META KEYWORDS: #{page.meta_keywords}"
- puts "#{page.links.size} links found..."
- page.links.each do |link|
-   puts " ==> #{link}"
- end
-
- puts "to_hash..."
- ap page.to_hash
data/samples/spider.rb DELETED
@@ -1,30 +0,0 @@
- # A basic spider that will follow links on an infinite loop
- $: << File.join(File.dirname(__FILE__), "/../lib")
- require 'rubygems'
- require 'meta_inspector'
-
- q = Queue.new
- visited_links=[]
-
- puts "Enter a valid http url to spider it following internal links"
- url = gets.strip
-
- page = MetaInspector.new(url)
- q.push(url)
-
- while q.size > 0
-   visited_links << url = q.pop
-   page = MetaInspector.new(url)
-   puts "Spidering #{page.url}"
-
-   puts "TITLE: #{page.title}"
-   puts "META DESCRIPTION: #{page.meta_description}"
-   puts "META KEYWORDS: #{page.meta_keywords}"
-   puts "LINKS: #{page.internal_links.size}"
-   page.internal_links.each do |link|
-     if !visited_links.include?(link)
-       q.push(link)
-     end
-   end
-   puts "#{visited_links.size} pages visited, #{q.size} pages on queue\n\n"
- end