metainspector 3.2.0 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 553be427edfb113d96f5956a55d184ecce6c2874
4
- data.tar.gz: 5c11f4b906d2fafa9bda9669521ac1cb1a958734
3
+ metadata.gz: 72feb77b18ae296efcdd96b30950e2deaefb9689
4
+ data.tar.gz: b6442b9f256c6b31e18005af2f5955c339b37298
5
5
  SHA512:
6
- metadata.gz: 0188e170dce0f4a4e94b5f91c280b6bc597dfe8b14eb3795d4bf0f7719a282ae04b1277c1b9bb02675f27891d2ddc23781e444bc78b47e95c61151a579bf92f3
7
- data.tar.gz: 51f3d50f0d1f575f5bafe3977bc9fbac3de740a7caf89a92b54a47e25408ba29946e9e09ea0a475462b7ded4651a237da72f8b8cd50e2672742a20caa9f692bb
6
+ metadata.gz: a9b4002024e2466d5538a1e54c39fe336f2b049128249956b1951a23dfb3f1d09cdea7da63b9e602102fe936f8e14b884000a48a1a970ecd05ab58a632aecef5
7
+ data.tar.gz: 99d8ced9fcf7e03df6316cd5442a54320df1dca1e32ad3a9ee9421638f38f779860994bece0496a048180df49bfbf67580df01ceb4c674d2e66c6df90e0c77f5
data/README.md CHANGED
@@ -45,9 +45,18 @@ You can also include the html which will be used as the document to scrape:
45
45
 
46
46
  page = MetaInspector.new("http://sitevalidator.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
47
47
 
48
+ ## Accessing response status and headers
49
+
50
+ You can check the status and headers from the response like this:
51
+
52
+ ```ruby
53
+ page.response.status # 200
54
+ page.response.headers # { "server"=>"nginx", "content-type"=>"text/html; charset=utf-8", "cache-control"=>"must-revalidate, private, max-age=0", ... }
55
+ ```
56
+
48
57
  ## Accessing scraped data
49
58
 
50
- Then you can see the scraped data like this:
59
+ You can see the scraped data like this:
51
60
 
52
61
  page.url # URL of the page
53
62
  page.scheme # Scheme of the page (http, https)
@@ -256,7 +265,7 @@ You should avoid using the `:store` option, or use it wisely, as silencing error
256
265
 
257
266
  ## Examples
258
267
 
259
- You can find some sample scripts on the samples folder, including a basic scraping and a spider that will follow external links using a queue. What follows is an example of use from irb:
268
+ You can find some sample scripts in the `examples` folder, including a basic scraping script and a spider that will follow internal links using a queue. What follows is an example of use from irb:
260
269
 
261
270
  $ irb
262
271
  >> require 'metainspector'
@@ -288,6 +297,8 @@ Thanks to all the contributors:
288
297
 
289
298
  [https://github.com/jaimeiniesta/metainspector/graphs/contributors](https://github.com/jaimeiniesta/metainspector/graphs/contributors)
290
299
 
300
+ You are more than welcome to come chat with us on our [Gitter room](https://gitter.im/jaimeiniesta/metainspector) and [Google group](https://groups.google.com/forum/#!forum/metainspector).
301
+
291
302
  ## Related projects
292
303
 
293
304
  * [go-metainspector](https://github.com/fern4lvarez/go-metainspector), a port of MetaInspector for Go.
@@ -0,0 +1,24 @@
1
+ # A basic MetaInspector example for scraping a page
2
+ #
3
+ # Usage example:
4
+ #
5
+ # ruby basic_scraping.rb jaimeiniesta.com
6
+
7
+ require 'metainspector'
8
+
9
+ # Get the starting URL
10
+ url = ARGV[0] || (puts "Enter an url"; gets.strip)
11
+
12
+ page = MetaInspector.new(url)
13
+
14
+ puts "Scraping #{page.url} returned these results:"
15
+ puts "TITLE: #{page.title}"
16
+ puts "META DESCRIPTION: #{page.meta['description']}"
17
+ puts "META KEYWORDS: #{page.meta['keywords']}"
18
+ puts "#{page.links.size} links found..."
19
+ page.links.each do |link|
20
+ puts " ==> #{link}"
21
+ end
22
+
23
+ puts "to_hash..."
24
+ puts page.to_hash
@@ -0,0 +1,100 @@
1
+ # A basic spider that will follow internal links, checking for broken links
2
+ #
3
+ # Usage example:
4
+ #
5
+ # ruby link_checker.rb alazan.com
6
+
7
+ require 'metainspector'
8
+
9
+ class BrokenLinkChecker
10
+ attr_reader :broken
11
+
12
+ def initialize(url)
13
+ @url = url
14
+ @queue = []
15
+ @visited = []
16
+ @ok = []
17
+ @broken = {}
18
+
19
+ check
20
+ end
21
+
22
+ def report
23
+ puts "\n#{@broken.size} broken links found."
24
+
25
+ @broken.each do |link, from|
26
+ puts "\n#{link} linked from"
27
+ from.each do |origin|
28
+ puts " - #{origin}"
29
+ end
30
+ end
31
+ end
32
+
33
+ private
34
+
35
+ def check
36
+ # Resolve initial redirections
37
+ page = MetaInspector.new(@url)
38
+
39
+ # Push this initial URL to the queue
40
+ @queue.push(page.url)
41
+
42
+ while @queue.any?
43
+ url = @queue.pop
44
+
45
+ page = MetaInspector.new(url, :warn_level => :store)
46
+
47
+ if page.ok?
48
+ # Gets all HTTP links
49
+ page.links.select {|l| l =~ /^http(s)?:\/\//i}.each do |link|
50
+ check_status(link, page.url)
51
+ end
52
+ end
53
+
54
+ @visited.push(page.url)
55
+
56
+ page.internal_links.each do |link|
57
+ @queue.push(link) unless @visited.include?(link) || @broken.include?(link) || @queue.include?(link)
58
+ end
59
+
60
+ puts "#{'%3s' % @visited.size} pages visited, #{'%3s' % @queue.size} pages on queue, #{'%2s' % @broken.size} broken links"
61
+ end
62
+ end
63
+
64
+ # Checks the response status of the linked_url and stores it in the ok or broken collections
65
+ def check_status(linked_url, from_url)
66
+ if @broken.keys.include?(linked_url)
67
+ # This was already known to be broken, we add another origin
68
+ @broken[linked_url] << from_url
69
+ else
70
+ if !@ok.include?(linked_url)
71
+ # We still don't know about this link status, so we check it now
72
+ if reachable?(linked_url)
73
+ @ok << linked_url
74
+ else
75
+ @broken[linked_url] = [from_url]
76
+ end
77
+ end
78
+ end
79
+ end
80
+
81
+ # A page is reachable if its response status is less than 400
82
+ # In the case of exceptions, like timeouts or server connection errors,
83
+ # we consider it unreachable
84
+ def reachable?(url)
85
+ page = MetaInspector.new(url)
86
+
87
+ if page.response.status < 400
88
+ true
89
+ else
90
+ false
91
+ end
92
+ rescue Exception => e
93
+ false
94
+ end
95
+ end
96
+
97
+ # Get the starting URL
98
+ url = ARGV[0] || (puts "Enter a starting url"; gets.strip)
99
+
100
+ BrokenLinkChecker.new(url).report
@@ -0,0 +1,39 @@
1
+ # A basic spider that will follow internal links
2
+ #
3
+ # Usage example:
4
+ #
5
+ # ruby spider.rb jaimeiniesta.com
6
+
7
+ require 'metainspector'
8
+
9
+ # Two arrays, one for the scraping queue and one for the visited links
10
+ queue = []
11
+ visited = []
12
+
13
+ # Get the starting URL
14
+ url = ARGV[0] || (puts "Enter a starting url"; gets.strip)
15
+
16
+ # Resolve initial redirections
17
+ page = MetaInspector.new(url)
18
+
19
+ # Push this initial URL to the queue
20
+ queue.push(page.url)
21
+
22
+ while queue.any?
23
+ url = queue.pop
24
+
25
+ visited.push(url)
26
+
27
+ puts "VISITED: #{url}"
28
+
29
+ page = MetaInspector.new(url)
30
+
31
+ page.internal_links.each do |link|
32
+ queue.push(link) unless visited.include?(link) || queue.include?(link)
33
+ end
34
+
35
+ puts "#{visited.size} pages visited, #{queue.size} pages on queue\n\n"
36
+ end
37
+
38
+ puts "\nScraping finished, these are the internal links found:\n\n"
39
+ puts visited.sort
@@ -34,7 +34,7 @@ module MetaInspector
34
34
 
35
35
  extend Forwardable
36
36
  def_delegators :@url, :url, :scheme, :host, :root_url
37
- def_delegators :@request, :content_type
37
+ def_delegators :@request, :content_type, :response
38
38
  def_delegators :@parser, :parsed, :respond_to?, :title, :description, :links, :internal_links, :external_links,
39
39
  :images, :image, :feed, :charset, :meta_tags, :meta_tag, :meta, :favicon
40
40
 
@@ -51,7 +51,9 @@ module MetaInspector
51
51
  'feed' => feed,
52
52
  'content_type' => content_type,
53
53
  'meta_tags' => meta_tags,
54
- 'favicon' => favicon
54
+ 'favicon' => favicon,
55
+ 'response' => { 'status' => response.status,
56
+ 'headers' => response.headers }
55
57
  }
56
58
  end
57
59
 
@@ -34,8 +34,6 @@ module MetaInspector
34
34
  response.headers["content-type"].split(";")[0] if response
35
35
  end
36
36
 
37
- private
38
-
39
37
  def response
40
38
  request_count ||= 0
41
39
  request_count += 1
@@ -48,6 +46,8 @@ module MetaInspector
48
46
  nil
49
47
  end
50
48
 
49
+ private
50
+
51
51
  def fetch
52
52
  session = Faraday.new(:url => url) do |faraday|
53
53
  if @allow_redirections
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
 
3
3
  module MetaInspector
4
- VERSION = "3.2.0"
4
+ VERSION = "3.3.0"
5
5
  end
@@ -24,9 +24,9 @@ describe MetaInspector::Document do
24
24
  it "should return a Hash with all the values set" do
25
25
  @m = MetaInspector::Document.new('http://pagerankalert.com')
26
26
  @m.to_hash.should == {
27
- "url" =>"http://pagerankalert.com/",
28
- "title" =>"PageRankAlert.com :: Track your PageRank changes & receive alerts",
29
- "favicon" =>"http://pagerankalert.com/src/favicon.ico",
27
+ "url" => "http://pagerankalert.com/",
28
+ "title" => "PageRankAlert.com :: Track your PageRank changes & receive alerts",
29
+ "favicon" => "http://pagerankalert.com/src/favicon.ico",
30
30
  "links" => ["http://pagerankalert.com/",
31
31
  "http://pagerankalert.com/es?language=es",
32
32
  "http://pagerankalert.com/users/sign_up",
@@ -46,14 +46,36 @@ describe MetaInspector::Document do
46
46
  "images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
47
47
  "charset" => "utf-8",
48
48
  "feed" => "http://feeds.feedburner.com/PageRankAlert",
49
- "content_type" =>"text/html",
50
- "meta_tags" => { "name" => { "description" => ["Track your PageRank(TM) changes and receive alerts by email"],
49
+ "content_type" => "text/html",
50
+ "meta_tags" => {
51
+ "name" => {
52
+ "description" => ["Track your PageRank(TM) changes and receive alerts by email"],
51
53
  "keywords" => ["pagerank, seo, optimization, google"], "robots"=>["all,follow"],
52
54
  "csrf-param" => ["authenticity_token"],
53
- "csrf-token" => ["iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="] },
55
+ "csrf-token" => ["iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="]
56
+ },
54
57
  "http-equiv" => {},
55
58
  "property" => {},
56
- "charset" => ["utf-8"] }
59
+ "charset" => ["utf-8"]
60
+ },
61
+ "response" => {
62
+ "status" => 200,
63
+ "headers" => {
64
+ "server" => "nginx/0.7.67",
65
+ "date"=>"Mon, 30 May 2011 09:45:42 GMT",
66
+ "content-type" => "text/html; charset=utf-8",
67
+ "connection" => "keep-alive",
68
+ "etag" => "\"d0534cf7ad7d7a7fb737fe4ad99b0fd1\"",
69
+ "x-ua-compatible" => "IE=Edge,chrome=1",
70
+ "x-runtime" => "0.031274",
71
+ "set-cookie" => "_session_id=33575f7694b4492af4c4e282d62a7127; path=/; HttpOnly",
72
+ "cache-control" => "max-age=0, private, must-revalidate",
73
+ "content-length" => "6690",
74
+ "x-varnish" => "2167295052",
75
+ "age" => "0",
76
+ "via" => "1.1 varnish"
77
+ }
78
+ }
57
79
  }
58
80
  end
59
81
 
@@ -0,0 +1,24 @@
1
+ HTTP/1.1 404 Not Found
2
+ Server: nginx
3
+ Date: Fri, 17 Oct 2014 21:01:44 GMT
4
+ Content-Type: text/html; charset=utf-8
5
+ Content-Length: 933
6
+ Connection: keep-alive
7
+ Status: 404 Not Found
8
+ X-Request-Id: 84997334b729a4e1ad65c10d9c1f68a7
9
+ X-Runtime: 0.032396
10
+ X-Rack-Cache: miss
11
+
12
+ <!DOCTYPE html>
13
+ <html>
14
+
15
+ <head>
16
+ <meta charset="UTF-8" />
17
+ <title>The page you were looking for doesn't exist (404)</title>
18
+ </head>
19
+
20
+ <body>
21
+ <h1>Four Oh Four!</h1>
22
+ <h2>The page you were looking for doesn't exist.</h2>
23
+ </body>
24
+ </html>
data/spec/request_spec.rb CHANGED
@@ -12,6 +12,23 @@ describe MetaInspector::Request do
12
12
  end
13
13
  end
14
14
 
15
+ describe "response" do
16
+ it "contains the response status" do
17
+ page_request = MetaInspector::Request.new(url('http://example.com'))
18
+ page_request.response.status.should == 200
19
+ end
20
+
21
+ it "contains the response headers" do
22
+ page_request = MetaInspector::Request.new(url('http://example.com'))
23
+ page_request.response.headers
24
+ .should == {"server"=>"nginx/0.7.67", "date"=>"Fri, 18 Nov 2011 21:46:46 GMT",
25
+ "content-type"=>"text/html", "connection"=>"keep-alive",
26
+ "last-modified"=>"Mon, 14 Nov 2011 16:53:18 GMT",
27
+ "content-length"=>"4987", "x-varnish"=>"2000423390",
28
+ "age"=>"0", "via"=>"1.1 varnish"}
29
+ end
30
+ end
31
+
15
32
  describe "content_type" do
16
33
  it "should return the correct content type of the url for html pages" do
17
34
  page_request = MetaInspector::Request.new(url('http://pagerankalert.com'))
data/spec/spec_helper.rb CHANGED
@@ -30,6 +30,9 @@ end
30
30
  # This is the base page to be used in the examples
31
31
  FakeWeb.register_uri(:get, "http://example.com/", :response => fixture_file("example.response"))
32
32
 
33
+ # Used to test response status codes
34
+ FakeWeb.register_uri(:get, "http://example.com/404", :response => fixture_file("404.response"))
35
+
33
36
  # These are older fixtures
34
37
  FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
35
38
  FakeWeb.register_uri(:get, "http://pagerankalert-shortcut.com", :response => fixture_file("pagerankalert-shortcut.com.response"))
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: metainspector
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.0
4
+ version: 3.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jaime Iniesta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-14 00:00:00.000000000 Z
11
+ date: 2014-10-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -179,6 +179,9 @@ files:
179
179
  - MIT-LICENSE
180
180
  - README.md
181
181
  - Rakefile
182
+ - examples/basic_scraping.rb
183
+ - examples/link_checker.rb
184
+ - examples/spider.rb
182
185
  - lib/meta_inspector.rb
183
186
  - lib/meta_inspector/document.rb
184
187
  - lib/meta_inspector/exception_log.rb
@@ -189,10 +192,9 @@ files:
189
192
  - lib/meta_inspector/version.rb
190
193
  - lib/metainspector.rb
191
194
  - meta_inspector.gemspec
192
- - samples/basic_scraping.rb
193
- - samples/spider.rb
194
195
  - spec/document_spec.rb
195
196
  - spec/exception_log_spec.rb
197
+ - spec/fixtures/404.response
196
198
  - spec/fixtures/alazan.com.response
197
199
  - spec/fixtures/alazan_websolution.response
198
200
  - spec/fixtures/charset_000.response
@@ -1,22 +0,0 @@
1
- # Some basic MetaInspector samples
2
-
3
- $: << File.join(File.dirname(__FILE__), "/../lib")
4
- require 'meta_inspector'
5
- require 'ap'
6
-
7
- puts "Enter a valid http url to scrape it"
8
- url = gets.strip
9
- page = MetaInspector.new(url)
10
- puts "...please wait while scraping the page..."
11
-
12
- puts "Scraping #{page.url} returned these results:"
13
- puts "TITLE: #{page.title}"
14
- puts "META DESCRIPTION: #{page.meta_description}"
15
- puts "META KEYWORDS: #{page.meta_keywords}"
16
- puts "#{page.links.size} links found..."
17
- page.links.each do |link|
18
- puts " ==> #{link}"
19
- end
20
-
21
- puts "to_hash..."
22
- ap page.to_hash
data/samples/spider.rb DELETED
@@ -1,30 +0,0 @@
1
- # A basic spider that will follow links on an infinite loop
2
- $: << File.join(File.dirname(__FILE__), "/../lib")
3
- require 'rubygems'
4
- require 'meta_inspector'
5
-
6
- q = Queue.new
7
- visited_links=[]
8
-
9
- puts "Enter a valid http url to spider it following internal links"
10
- url = gets.strip
11
-
12
- page = MetaInspector.new(url)
13
- q.push(url)
14
-
15
- while q.size > 0
16
- visited_links << url = q.pop
17
- page = MetaInspector.new(url)
18
- puts "Spidering #{page.url}"
19
-
20
- puts "TITLE: #{page.title}"
21
- puts "META DESCRIPTION: #{page.meta_description}"
22
- puts "META KEYWORDS: #{page.meta_keywords}"
23
- puts "LINKS: #{page.internal_links.size}"
24
- page.internal_links.each do |link|
25
- if !visited_links.include?(link)
26
- q.push(link)
27
- end
28
- end
29
- puts "#{visited_links.size} pages visited, #{q.size} pages on queue\n\n"
30
- end