metainspector 3.2.0 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +13 -2
- data/examples/basic_scraping.rb +24 -0
- data/examples/link_checker.rb +100 -0
- data/examples/spider.rb +39 -0
- data/lib/meta_inspector/document.rb +4 -2
- data/lib/meta_inspector/request.rb +2 -2
- data/lib/meta_inspector/version.rb +1 -1
- data/spec/document_spec.rb +29 -7
- data/spec/fixtures/404.response +24 -0
- data/spec/request_spec.rb +17 -0
- data/spec/spec_helper.rb +3 -0
- metadata +6 -4
- data/samples/basic_scraping.rb +0 -22
- data/samples/spider.rb +0 -30
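The headline change in 3.3.0 is that the HTTP response is now part of the public API, as the diffs below show. A minimal sketch of the new surface (method names are taken from the diffs below; the URL and return values are illustrative):

```ruby
require 'metainspector'

page = MetaInspector.new("http://example.com")

# Newly public in 3.3.0: the response, delegated from the request object
page.response.status   # e.g. 200
page.response.headers  # e.g. { "server" => "nginx", "content-type" => "text/html; charset=utf-8", ... }
```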
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 72feb77b18ae296efcdd96b30950e2deaefb9689
+  data.tar.gz: b6442b9f256c6b31e18005af2f5955c339b37298
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a9b4002024e2466d5538a1e54c39fe336f2b049128249956b1951a23dfb3f1d09cdea7da63b9e602102fe936f8e14b884000a48a1a970ecd05ab58a632aecef5
+  data.tar.gz: 99d8ced9fcf7e03df6316cd5442a54320df1dca1e32ad3a9ee9421638f38f779860994bece0496a048180df49bfbf67580df01ceb4c674d2e66c6df90e0c77f5
data/README.md
CHANGED
@@ -45,9 +45,18 @@ You can also include the html which will be used as the document to scrape:
 
 page = MetaInspector.new("http://sitevalidator.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
 
+## Accessing response status and headers
+
+You can check the status and headers from the response like this:
+
+```ruby
+page.response.status # 200
+page.response.headers # { "server"=>"nginx", "content-type"=>"text/html; charset=utf-8", "cache-control"=>"must-revalidate, private, max-age=0", ... }
+```
+
 ## Accessing scraped data
 
-
+You can see the scraped data like this:
 
 page.url # URL of the page
 page.scheme # Scheme of the page (http, https)
@@ -256,7 +265,7 @@ You should avoid using the `:store` option, or use it wisely, as silencing errors
 
 ## Examples
 
-You can find some sample scripts on the
+You can find some sample scripts on the `examples` folder, including a basic scraping and a spider that will follow external links using a queue. What follows is an example of use from irb:
 
 $ irb
 >> require 'metainspector'
@@ -288,6 +297,8 @@ Thanks to all the contributors:
 
 [https://github.com/jaimeiniesta/metainspector/graphs/contributors](https://github.com/jaimeiniesta/metainspector/graphs/contributors)
 
+You are more than welcome to come chat with us on our [Gitter room](https://gitter.im/jaimeiniesta/metainspector) and [Google group](https://groups.google.com/forum/#!forum/metainspector).
+
 ## Related projects
 
 * [go-metainspector](https://github.com/fern4lvarez/go-metainspector), a port of MetaInspector for Go.
data/examples/basic_scraping.rb
ADDED
@@ -0,0 +1,24 @@
+# A basic MetaInspector example for scraping a page
+#
+# Usage example:
+#
+# ruby basic_scraping.rb jaimeiniesta.com
+
+require 'metainspector'
+
+# Get the starting URL
+url = ARGV[0] || (puts "Enter an url"; gets.strip)
+
+page = MetaInspector.new(url)
+
+puts "Scraping #{page.url} returned these results:"
+puts "TITLE: #{page.title}"
+puts "META DESCRIPTION: #{page.meta['description']}"
+puts "META KEYWORDS: #{page.meta['keywords']}"
+puts "#{page.links.size} links found..."
+page.links.each do |link|
+  puts " ==> #{link}"
+end
+
+puts "to_hash..."
+puts page.to_hash
data/examples/link_checker.rb
ADDED
@@ -0,0 +1,100 @@
+# A basic spider that will follow internal links, checking broken links
+#
+# Usage example:
+#
+# ruby link_checker.rb alazan.com
+
+require 'metainspector'
+
+class BrokenLinkChecker
+  attr_reader :broken
+
+  def initialize(url)
+    @url     = url
+    @queue   = []
+    @visited = []
+    @ok      = []
+    @broken  = {}
+
+    check
+  end
+
+  def report
+    puts "\n#{@broken.size} broken links found."
+
+    @broken.each do |link, from|
+      puts "\n#{link} linked from"
+      from.each do |origin|
+        puts " - #{origin}"
+      end
+    end
+  end
+
+  private
+
+  def check
+    # Resolve initial redirections
+    page = MetaInspector.new(@url)
+
+    # Push this initial URL to the queue
+    @queue.push(page.url)
+
+    while @queue.any?
+      url = @queue.pop
+
+      page = MetaInspector.new(url, :warn_level => :store)
+
+      if page.ok?
+        # Gets all HTTP links
+        page.links.select {|l| l =~ /^http(s)?:\/\//i}.each do |link|
+          check_status(link, page.url)
+        end
+      end
+
+      @visited.push(page.url)
+
+      page.internal_links.each do |link|
+        @queue.push(link) unless @visited.include?(link) || @broken.include?(link) || @queue.include?(link)
+      end
+
+      puts "#{'%3s' % @visited.size} pages visited, #{'%3s' % @queue.size} pages on queue, #{'%2s' % @broken.size} broken links"
+    end
+  end
+
+  # Checks the response status of the linked_url and stores it on the ok or broken collections
+  def check_status(linked_url, from_url)
+    if @broken.keys.include?(linked_url)
+      # This was already known to be broken, we add another origin
+      @broken[linked_url] << from_url
+    else
+      if !@ok.include?(linked_url)
+        # We still don't know about this link status, so we check it now
+        if reachable?(linked_url)
+          @ok << linked_url
+        else
+          @broken[linked_url] = [from_url]
+        end
+      end
+    end
+  end
+
+  # A page is reachable if its response status is less than 400
+  # In the case of exceptions, like timeouts or server connection errors,
+  # we consider it unreachable
+  def reachable?(url)
+    page = MetaInspector.new(url)
+
+    if page.response.status < 400
+      true
+    else
+      false
+    end
+  rescue Exception => e
+    false
+  end
+end
+
+# Get the starting URL
+url = ARGV[0] || (puts "Enter a starting url"; gets.strip)
+
+BrokenLinkChecker.new(url).report
data/examples/spider.rb
ADDED
@@ -0,0 +1,39 @@
+# A basic spider that will follow internal links
+#
+# Usage example:
+#
+# ruby spider.rb jaimeiniesta.com
+
+require 'metainspector'
+
+# Two arrays, one for the scraping queue and one for the visited links
+queue   = []
+visited = []
+
+# Get the starting URL
+url = ARGV[0] || (puts "Enter a starting url"; gets.strip)
+
+# Resolve initial redirections
+page = MetaInspector.new(url)
+
+# Push this initial URL to the queue
+queue.push(page.url)
+
+while queue.any?
+  url = queue.pop
+
+  visited.push(url)
+
+  puts "VISITED: #{url}"
+
+  page = MetaInspector.new(url)
+
+  page.internal_links.each do |link|
+    queue.push(link) unless visited.include?(link) || queue.include?(link)
+  end
+
+  puts "#{visited.size} pages visited, #{queue.size} pages on queue\n\n"
+end
+
+puts "\nScraping finished, these are the internal links found:\n\n"
+puts visited.sort
data/lib/meta_inspector/document.rb
CHANGED
@@ -34,7 +34,7 @@ module MetaInspector
 
     extend Forwardable
     def_delegators :@url, :url, :scheme, :host, :root_url
-    def_delegators :@request, :content_type
+    def_delegators :@request, :content_type, :response
     def_delegators :@parser, :parsed, :respond_to?, :title, :description, :links, :internal_links, :external_links,
                    :images, :image, :feed, :charset, :meta_tags, :meta_tag, :meta, :favicon
 
@@ -51,7 +51,9 @@ module MetaInspector
       'feed' => feed,
       'content_type' => content_type,
       'meta_tags' => meta_tags,
-      'favicon' => favicon
+      'favicon' => favicon,
+      'response' => { 'status' => response.status,
+                      'headers' => response.headers }
     }
   end
 
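The document now delegates `response` to the request and surfaces a `response` entry in `to_hash`. A minimal sketch of the new entry (illustrative URL and values, assuming a successful request):

```ruby
page = MetaInspector.new("http://example.com")

# The new 'response' entry nests status and headers, per the diff above
page.to_hash['response']  # => { 'status' => 200, 'headers' => { "server" => "nginx", ... } }
```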
data/lib/meta_inspector/request.rb
CHANGED
@@ -34,8 +34,6 @@ module MetaInspector
       response.headers["content-type"].split(";")[0] if response
     end
 
-    private
-
     def response
       request_count ||= 0
       request_count += 1
@@ -48,6 +46,8 @@ module MetaInspector
       nil
     end
 
+    private
+
     def fetch
       session = Faraday.new(:url => url) do |faraday|
         if @allow_redirections
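This change simply moves `response` above the `private` keyword, making the response object (built by the Faraday session in `fetch`) publicly readable, which is what lets the document delegate to it. A sketch only; you would normally reach it through the document:

```ruby
page = MetaInspector.new("http://example.com")

# Previously response was private to the request and not delegated,
# so this raised NoMethodError; as of 3.3.0 it works.
page.response.status
```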
data/spec/document_spec.rb
CHANGED
@@ -24,9 +24,9 @@ describe MetaInspector::Document do
   it "should return a Hash with all the values set" do
     @m = MetaInspector::Document.new('http://pagerankalert.com')
     @m.to_hash.should == {
-      "url" =>"http://pagerankalert.com/",
-      "title" =>"PageRankAlert.com :: Track your PageRank changes & receive alerts",
-      "favicon" =>"http://pagerankalert.com/src/favicon.ico",
+      "url" => "http://pagerankalert.com/",
+      "title" => "PageRankAlert.com :: Track your PageRank changes & receive alerts",
+      "favicon" => "http://pagerankalert.com/src/favicon.ico",
       "links" => ["http://pagerankalert.com/",
         "http://pagerankalert.com/es?language=es",
         "http://pagerankalert.com/users/sign_up",
@@ -46,14 +46,36 @@ describe MetaInspector::Document do
       "images" => ["http://pagerankalert.com/images/pagerank_alert.png?1305794559"],
       "charset" => "utf-8",
       "feed" => "http://feeds.feedburner.com/PageRankAlert",
-      "content_type" =>"text/html",
-      "meta_tags" => {
+      "content_type" => "text/html",
+      "meta_tags" => {
+        "name" => {
+          "description" => ["Track your PageRank(TM) changes and receive alerts by email"],
           "keywords" => ["pagerank, seo, optimization, google"], "robots"=>["all,follow"],
           "csrf-param" => ["authenticity_token"],
-      "csrf-token" => ["iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="]
+          "csrf-token" => ["iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="]
+        },
         "http-equiv" => {},
         "property" => {},
-      "charset" => ["utf-8"]
+        "charset" => ["utf-8"]
+      },
+      "response" => {
+        "status" => 200,
+        "headers" => {
+          "server" => "nginx/0.7.67",
+          "date"=>"Mon, 30 May 2011 09:45:42 GMT",
+          "content-type" => "text/html; charset=utf-8",
+          "connection" => "keep-alive",
+          "etag" => "\"d0534cf7ad7d7a7fb737fe4ad99b0fd1\"",
+          "x-ua-compatible" => "IE=Edge,chrome=1",
+          "x-runtime" => "0.031274",
+          "set-cookie" => "_session_id=33575f7694b4492af4c4e282d62a7127; path=/; HttpOnly",
+          "cache-control" => "max-age=0, private, must-revalidate",
+          "content-length" => "6690",
+          "x-varnish" => "2167295052",
+          "age" => "0",
+          "via" => "1.1 varnish"
+        }
+      }
     }
   end
 
data/spec/fixtures/404.response
ADDED
@@ -0,0 +1,24 @@
+HTTP/1.1 404 Not Found
+Server: nginx
+Date: Fri, 17 Oct 2014 21:01:44 GMT
+Content-Type: text/html; charset=utf-8
+Content-Length: 933
+Connection: keep-alive
+Status: 404 Not Found
+X-Request-Id: 84997334b729a4e1ad65c10d9c1f68a7
+X-Runtime: 0.032396
+X-Rack-Cache: miss
+
+<!DOCTYPE html>
+<html>
+
+<head>
+  <meta charset="UTF-8" />
+  <title>The page you were looking for doesn't exist (404)</title>
+</head>
+
+<body>
+  <h1>Four Oh Four!</h1>
+  <h2>The page you were looking for doesn't exist.</h2>
+</body>
+</html>
data/spec/request_spec.rb
CHANGED
@@ -12,6 +12,23 @@ describe MetaInspector::Request do
     end
   end
 
+  describe "response" do
+    it "contains the response status" do
+      page_request = MetaInspector::Request.new(url('http://example.com'))
+      page_request.response.status.should == 200
+    end
+
+    it "contains the response headers" do
+      page_request = MetaInspector::Request.new(url('http://example.com'))
+      page_request.response.headers
+        .should == {"server"=>"nginx/0.7.67", "date"=>"Fri, 18 Nov 2011 21:46:46 GMT",
+                    "content-type"=>"text/html", "connection"=>"keep-alive",
+                    "last-modified"=>"Mon, 14 Nov 2011 16:53:18 GMT",
+                    "content-length"=>"4987", "x-varnish"=>"2000423390",
+                    "age"=>"0", "via"=>"1.1 varnish"}
+    end
+  end
+
   describe "content_type" do
     it "should return the correct content type of the url for html pages" do
       page_request = MetaInspector::Request.new(url('http://pagerankalert.com'))
data/spec/spec_helper.rb
CHANGED
@@ -30,6 +30,9 @@ end
 # This is the base page to be used in the examples
 FakeWeb.register_uri(:get, "http://example.com/", :response => fixture_file("example.response"))
 
+# Used to test response status codes
+FakeWeb.register_uri(:get, "http://example.com/404", :response => fixture_file("404.response"))
+
 # These are older fixtures
 FakeWeb.register_uri(:get, "http://pagerankalert.com", :response => fixture_file("pagerankalert.com.response"))
 FakeWeb.register_uri(:get, "http://pagerankalert-shortcut.com", :response => fixture_file("pagerankalert-shortcut.com.response"))
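With this registration, specs can exercise a 404 without touching the network. A hypothetical spec in the style of request_spec.rb above (not part of this release; the `url` helper is the one the existing specs already use):

```ruby
it "returns the status of the 404 fixture" do
  page_request = MetaInspector::Request.new(url('http://example.com/404'))
  page_request.response.status.should == 404
end
```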
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: metainspector
 version: !ruby/object:Gem::Version
-  version: 3.
+  version: 3.3.0
 platform: ruby
 authors:
 - Jaime Iniesta
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-10-
+date: 2014-10-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -179,6 +179,9 @@ files:
 - MIT-LICENSE
 - README.md
 - Rakefile
+- examples/basic_scraping.rb
+- examples/link_checker.rb
+- examples/spider.rb
 - lib/meta_inspector.rb
 - lib/meta_inspector/document.rb
 - lib/meta_inspector/exception_log.rb
@@ -189,10 +192,9 @@ files:
 - lib/meta_inspector/version.rb
 - lib/metainspector.rb
 - meta_inspector.gemspec
-- samples/basic_scraping.rb
-- samples/spider.rb
 - spec/document_spec.rb
 - spec/exception_log_spec.rb
+- spec/fixtures/404.response
 - spec/fixtures/alazan.com.response
 - spec/fixtures/alazan_websolution.response
 - spec/fixtures/charset_000.response
data/samples/basic_scraping.rb
DELETED
@@ -1,22 +0,0 @@
-# Some basic MetaInspector samples
-
-$: << File.join(File.dirname(__FILE__), "/../lib")
-require 'meta_inspector'
-require 'ap'
-
-puts "Enter a valid http url to scrape it"
-url = gets.strip
-page = MetaInspector.new(url)
-puts "...please wait while scraping the page..."
-
-puts "Scraping #{page.url} returned these results:"
-puts "TITLE: #{page.title}"
-puts "META DESCRIPTION: #{page.meta_description}"
-puts "META KEYWORDS: #{page.meta_keywords}"
-puts "#{page.links.size} links found..."
-page.links.each do |link|
-  puts " ==> #{link}"
-end
-
-puts "to_hash..."
-ap page.to_hash
data/samples/spider.rb
DELETED
@@ -1,30 +0,0 @@
-# A basic spider that will follow links on an infinite loop
-$: << File.join(File.dirname(__FILE__), "/../lib")
-require 'rubygems'
-require 'meta_inspector'
-
-q = Queue.new
-visited_links=[]
-
-puts "Enter a valid http url to spider it following internal links"
-url = gets.strip
-
-page = MetaInspector.new(url)
-q.push(url)
-
-while q.size > 0
-  visited_links << url = q.pop
-  page = MetaInspector.new(url)
-  puts "Spidering #{page.url}"
-
-  puts "TITLE: #{page.title}"
-  puts "META DESCRIPTION: #{page.meta_description}"
-  puts "META KEYWORDS: #{page.meta_keywords}"
-  puts "LINKS: #{page.internal_links.size}"
-  page.internal_links.each do |link|
-    if !visited_links.include?(link)
-      q.push(link)
-    end
-  end
-  puts "#{visited_links.size} pages visited, #{q.size} pages on queue\n\n"
-end