RubyGems - basilisk - Versions diffs - 0.2.5 - Mend

basilisk 0.2.5

Files changed (17) hide show

data/HISTORY +3 -0
data/LICENSE +23 -0
data/README.rdoc +36 -0
data/bin/basil +47 -0
data/lib/basilisk.rb +37 -0
data/lib/basilisk/core.rb +69 -0
data/lib/basilisk/parser.rb +69 -0
data/lib/basilisk/processor.rb +43 -0
data/lib/basilisk/processors/error_processor.rb +33 -0
data/lib/basilisk/processors/image_processor.rb +93 -0
data/lib/basilisk/processors/seo_processor.rb +74 -0
data/lib/basilisk/processors/sitemap_processor.rb +52 -0
data/lib/basilisk/processors/terms_processor.rb +49 -0
data/lib/basilisk/template.rb +54 -0
data/test/basilisk_test.rb +0 -0
data/test/test_helper.rb +0 -0
metadata +100 -0

data/HISTORY ADDED

@@ -0,0 +1,3 @@
+=== 1.0.0 / 2008-12-16
+* First release

data/LICENSE ADDED

@@ -0,0 +1,23 @@
+Copyright (c) 2009, Kyle Banker, Alexander Interactive, Inc.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+Except as contained in this notice, the name(s) of the above copyright holders
+shall not be used in advertising or otherwise to promote the sale, use or other
+dealings in this Software without prior written authorization.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.rdoc ADDED

@@ -0,0 +1,36 @@
+= basilisk
+a command-line front-end for the anemone web-crawler (http://github.com/chriskite/anemone). basilisk produces useful reports for qa-ing websites. It also features an extensible page processor class for writing your own page processors.
+Included page processors:
+- seo:     generates a csv with the following columns: url, title, description, keywords, h1s, h2s
+- sitemap: generates an xml sitemap
+- image:   generates a list of broken images and images lacking an alt tag.
+- error:   generates a csv of urls returning HTTP response codes other than success and redirect.
+See the generated yml config file for even more options.
+== install
+  sudo gem install basilisk
+== usage
+To create a new search:
+  basil create [search_name] [url]
+- Creates a search config file ([search_name].yml), which you may edit to change the default options, specify which page processors you want to run, any regex and css terms for searching across the site, and regexes for skipping urls.
+To run the search:
+  basil run [search_name]
+- Runs the specified search. Note: you must create a search before running it. Files generated by the page processors will reside in a folder called [search_name].
+== author & license
+basilisk is licensed under a modified MIT license. See the LICENSE file.
+basilisk was written by Kyle Banker, largely dependent on the anemone web-crawler by Chris Kite.
+Copyright 2009 Alexander Interactive, Inc.

data/bin/basil ADDED

@@ -0,0 +1,47 @@
+#! /usr/bin/env ruby
+# == Synopsis
+# Command-line front-end for basilisk: creates a search config file and
+# runs a crawl of the configured site with the selected page processors.
+#
+# == Usage
+# basil create [search_name] [url]
+# basil run [search_name]
+#
+# == Author
+# Kyle Banker
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+require 'basilisk'
+def usage
+  puts <<END
+basil(isk): a front-end for the anemone web crawler.
+Usage:
+  To create a new search:
+    basil create [search_name] [url]
+      - This will create a search config file, which you may edit to change the default options.
+  To run the search:
+    basil run [search_name]
+      - Runs the specified search. Note: you must create a search before running it.
+END
+end
+begin
+  if ARGV[0] == "create" && ARGV[1] && URI(ARGV[2])
+    Basilisk.create(ARGV[1], ARGV[2])
+  elsif ARGV[0] == "run" && !ARGV[1].nil?
+    Basilisk.run(File.join(Dir.pwd, ARGV[1]))
+  else
+    raise BasiliskArgumentError
+  end
+rescue BasiliskArgumentError
+  usage
+  Process.exit
+end

data/lib/basilisk.rb ADDED

@@ -0,0 +1,37 @@
+require 'ostruct'
+require 'yaml'
+require 'rubygems'
+require 'anemone'
+require 'fastercsv'
+require 'basilisk/core'
+require 'basilisk/parser'
+require 'basilisk/processor'
+require 'basilisk/template'
+$:.unshift File.join(File.dirname(__FILE__), 'basilisk', 'processors')
+require 'seo_processor'
+require 'sitemap_processor'
+require 'error_processor'
+require 'terms_processor'
+require 'image_processor'
+BASILISK_ROOT = File.join(File.dirname(__FILE__), "..")
+class BasiliskError < StandardError; end
+class BasiliskArgumentError < BasiliskError; end
+class BasiliskImageError < BasiliskError; end
+module Basilisk
+  extend self
+  def run(opt_file)
+    Basilisk::Core.run(Basilisk::Parser.get_options(opt_file))
+  end
+  def create(search_name, url)
+    Basilisk::Core.create(search_name, url)
+  end
+end

data/lib/basilisk/core.rb ADDED

@@ -0,0 +1,69 @@
+module Basilisk
+  # Runs crawls through Anemone and scaffolds new searches
+  # (a YAML config file plus a results folder).
+  module Core
+    extend self
+    # Takes search options and runs the crawler with any processors.
+    #
+    # search_opts - options object; this method reads url, user_agent,
+    #               skip_patterns and processor_instances from it.
+    def run(search_opts)
+      # We need to close the processors if user presses ctrl-c.
+      trap("INT") do
+        puts "\n**Interrupt received**\n***Closing processors...\n"
+        close_processors(search_opts.processor_instances)
+        Process.exit
+      end
+      Anemone.crawl(search_opts.url, :user_agent => search_opts.user_agent, :verbose => true) do |anemone|
+        anemone.skip_links_like(search_opts.skip_patterns || [])
+        # Every crawled page is handed to each configured processor.
+        anemone.on_every_page do |page|
+          search_opts.processor_instances.each do |processor|
+            processor.process_page(page, anemone.pages)
+          end
+        end
+        # Close callback on all processors.
+        anemone.after_crawl do |pages|
+          close_processors(search_opts.processor_instances)
+        end
+      end
+    end
+    # Calls close_file on every processor in instances.
+    def close_processors(instances)
+      instances.each do |processor|
+        processor.close_file
+      end
+    end
+    # Create a folder for the processor results,
+    # and a default yaml config file in the current directory.
+    # Failures are reported on standard output instead of being raised.
+    def create(search_name, url)
+      filename   = create_config_file(search_name, url)
+      foldername = create_results_folder(search_name)
+      Basilisk::Template.output_instructions(search_name, filename, foldername)
+    rescue => e
+      puts "Error: Could not create config file or folder."
+      puts "Please make sure that a folder of the same name doesn't already exist.\n"
+      puts "(#{e})"
+    end
+    # Writes the default config to [search_name].yml in the current
+    # directory and returns the full path of that file.
+    #
+    # filename - ignored; accepted only so existing three-argument callers
+    #            keep working. The path is always derived from search_name.
+    def create_config_file(search_name, url, filename = nil)
+      filename = File.join(Dir.pwd, search_name + ".yml")
+      # Block form closes the file even if rendering the template raises.
+      File.open(filename, "w") do |file|
+        file.write(Basilisk::Template.default(:name => search_name, :url => url))
+      end
+      filename
+    end
+    # Creates the results folder named after the search in the current
+    # directory and returns its path. Dir.mkdir raises if it already exists.
+    def create_results_folder(search_name)
+      foldername = File.join(Dir.pwd, search_name)
+      Dir.mkdir(foldername)
+      foldername
+    end
+  end
+end

data/lib/basilisk/parser.rb ADDED

@@ -0,0 +1,69 @@
+module Basilisk
+  # Parses YAML config file and instantiates specified processor classes.
+  module Parser
+    extend self
+    def get_options(opt_file)
+      yaml_opts         = open_yaml_file(opt_file)
+      search_opts       = assign_options(yaml_opts)
+      validate_options(search_opts)
+      return search_opts
+    end
+    private
+    def open_yaml_file(filename)
+      filename += ".yml" unless filename.include?(".yml")
+      YAML::parse(File.open(filename))
+    end
+    def assign_options(yaml_opts)
+      search_opts               = OpenStruct.new
+      search_opts.name          = yaml_opts['basilisk']['name'].value
+      search_opts.url           = yaml_opts['basilisk']['url'].value
+      search_opts.user_agent    = yaml_opts['basilisk']['user_agent'].value
+      search_opts.skip_patterns = get_patterns(yaml_opts['basilisk']['skip_url_patterns'].value)
+      search_opts.processor_instances   =
+        instantiate_processors(yaml_opts['basilisk']['processors'].value, search_opts.name)
+      search_opts.regex_search_terms   = get_patterns(yaml_opts['basilisk']['regex_search_terms'].value)
+      search_opts.css_search_terms     = split_and_strip(yaml_opts['basilisk']['css_search_terms'].value, ";")
+      search_opts.processor_instances  << init_term_processor(search_opts) if search_has_terms?(search_opts)
+      return search_opts
+    end
+    def validate_options(search_opts)
+      return true
+    end
+    def instantiate_processors(processors, search_name)
+      split_and_strip(processors, ";").map do |name|
+        get_processor_class(name).new(search_name)
+      end
+    end
+    # Returns an array of case-insensitive regexps.
+    def get_patterns(pattern_string)
+      split_and_strip(pattern_string, ";").select do |name|
+        name != ""
+      end.map {|name| Regexp.new(name, true)}
+    end
+    def split_and_strip(collection, separator)
+      collection.split(separator).map {|item| item.strip }
+    end
+    def get_processor_class(name)
+      Module.const_get("Basilisk").const_get(name.capitalize + "Processor")
+    end
+    def search_has_terms?(opts)
+      !opts.regex_search_terms.empty? || !opts.css_search_terms.empty?
+    end
+    def init_term_processor(opts)
+      Basilisk::TermsProcessor.new(opts.name, opts.regex_search_terms, opts.css_search_terms)
+    end
+  end
+end

data/lib/basilisk/processor.rb ADDED

@@ -0,0 +1,43 @@
+module Basilisk
+  # Base class for page processors.
+  class Processor
+    def initialize(search_name)
+      @search_name   = search_name
+      @base_folder   = Dir.pwd
+      @output_folder = File.join(Dir.pwd, search_name)
+    end
+    def process_page(page, page_hash)
+    end
+    # Called when the crawl is completed.
+    def close_file
+    end
+    protected
+    def filename_for_output
+      File.join @output_folder,
+        self.class.name.sub("Processor", "").sub("Basilisk::", "").downcase + ".csv"
+    end
+    def write_file(&block)
+      file = File.open(filename_for_output, "a")
+      yield file
+      file.close
+    end
+  end
+  # Processors that outputs a csv should inherit from this class.
+  class CSVProcessor < Processor
+    def write_file(&block)
+      FasterCSV.open(filename_for_output, "a") do |csv|
+        yield csv
+      end
+    end
+  end
+end

data/lib/basilisk/processors/error_processor.rb ADDED

@@ -0,0 +1,33 @@
+module Basilisk
+  # Stores page errors.
+  class ErrorProcessor < Basilisk::CSVProcessor
+    def initialize(search_name)
+      super
+      save_header_row
+    end
+    def process_page(page, page_hash)
+      write_row(page, page_hash) if page.code != 200 && !page.redirect?
+    end
+    private
+    def filename_for_output
+      File.join @output_folder, @search_name + "-errors.csv"
+    end
+    def save_header_row
+      write_file do |csv|
+        csv << ["URL", "Error"]
+      end
+    end
+    def write_row(page, page_hash)
+      write_file do |file|
+        file << [page.url, page.code]
+      end
+    end
+  end
+end

data/lib/basilisk/processors/image_processor.rb ADDED

@@ -0,0 +1,93 @@
+require 'net/http'
+require 'uri'
+module Basilisk
+  # Generates a report for broken images and images missing alt tags.
+  class ImageProcessor < Basilisk::CSVProcessor
+    def initialize(search_name)
+      super
+      save_header_row
+      @image_url_cache = []
+    end
+    def process_page(page, page_hash)
+      return unless page.doc
+      begin
+        page.doc.css('img').each do |image|
+          begin
+          image_src = image['src']
+          absolute_image_url = image_url(page, image_src)
+          next if @image_url_cache.include?(absolute_image_url)
+          @image_url_cache << absolute_image_url
+          check_for_broken_image(page, absolute_image_url)
+          check_for_missing_alt_tag(page, image, absolute_image_url)
+        rescue BasiliskImageError => e
+          write_row(page, image['src'], e.message)
+        end
+      end
+      end
+    end
+    private
+    def check_for_broken_image(page, absolute_image_src)
+      http_status = image_http_status(absolute_image_src)
+      if http_status != "200"
+        write_row(page, absolute_image_src, "Image broken (#{http_status})")
+      end
+    end
+    def check_for_missing_alt_tag(page, image, absolute_image_src)
+      return unless image['alt']
+      image_alt = image['alt'].strip
+      if image_alt == ""
+        write_row(page, absolute_image_src, "Alt tag missing")
+      end
+    end
+    # Perform a head request on the image so we won't have to download it.
+    def image_http_status(uri)
+      puts "Requesting Image: #{uri}"
+      http     = Net::HTTP.new(uri.host, uri.port)
+      response = http.head(uri.path)
+      return response.code
+      rescue
+        return "500"
+    end
+    # Construct the image's absolute url, if necessary.
+    def image_url(page, image_src)
+      image_uri = URI.parse(image_src)
+      if image_uri.absolute?
+        image_uri
+      elsif image_uri.relative?
+        root     = URI::Generic.build :scheme => page.url.scheme, :host => page.url.host
+        URI.join root.to_s, image_uri.to_s
+      else
+        raise BasiliskImageError, "Could not parse image src."
+      end
+      rescue
+        raise BasiliskImageError, "Could not parse image src."
+    end
+    def filename_for_output
+      File.join @output_folder, @search_name + "-images.csv"
+    end
+    def save_header_row
+      write_file do |csv|
+        csv << ["Page URL", "Image URL", "Message"]
+      end
+    end
+    def write_row(page, image_url, message)
+      write_file do |file|
+        file << [page.url, image_url, message]
+      end
+    end
+  end
+end

data/lib/basilisk/processors/seo_processor.rb ADDED

@@ -0,0 +1,74 @@
+module Basilisk
+  # Write a csv containing important seo fields: title, h1, h2, description, and keywords.
+  class SeoProcessor < Basilisk::CSVProcessor
+    HTMLTags = ['title', 'h1', 'h2']
+    MetaTags = ['description', 'keywords']
+    def initialize(search_name)
+      super
+      save_header_row
+    end
+    def process_page(page, page_hash)
+      @tags = Hash.new("")
+      HTMLTags.each { |tag_name|  check_html_element(tag_name, page.doc) }
+      MetaTags.each { |meta_name| check_meta_element(meta_name, page.doc) }
+      save_tag_row(page)
+    end
+    private
+    # Take a tag name (:h1, :title) and an hpricot doc.
+    # Stores the number of occurrences of the element
+    # along with its content.
+    def check_html_element(name, doc)
+      elements = doc.css(name)
+      process_tag(name, elements, "text")
+    end
+    # Take a meta name (:description, keywords) and an hpricot doc.
+    # Stores the number of occurrences of the element
+    # along with its content.
+    def check_meta_element(name, doc)
+      elements = doc.css("meta[@name='#{name}']")
+      process_tag(name, elements, ["[]", "content"])
+    end
+    # Code that processes an array of nokogiri elements
+    # by formatting them and saving to the @tags hash.
+    def process_tag(name, elements, content_method)
+      (@tags[name] += "MISSING") && return if elements.empty?
+      @tags[name]  += "(#{elements.size}): " if elements.size > 1
+      elements.each do |e|
+        content     = e.send(*content_method)
+        text_to_add = content == "" ? "BLANK" : "#{content}"
+        text_to_add = add_parentheses(text_to_add) if elements.size > 1
+        @tags[name] += text_to_add
+      end
+      @tags[name].strip!
+    end
+    def save_header_row
+      write_file do |csv|
+        csv << ["URL", HTMLTags, MetaTags].flatten
+      end
+    end
+    def save_tag_row(page)
+      row = []
+      row << page.url.to_s
+      [HTMLTags, MetaTags].flatten.each do |tag_key|
+        row << @tags[tag_key]
+      end
+      write_file do |csv|
+        csv << row
+      end
+    end
+    def add_parentheses(text)
+      "(#{text}) "
+    end
+  end
+end

data/lib/basilisk/processors/sitemap_processor.rb ADDED

@@ -0,0 +1,52 @@
+module Basilisk
+  # Build a google-compatible xml sitemap for the crawled site.
+  class SitemapProcessor < Basilisk::Processor
+    def initialize(search_name)
+      super
+      @date = Time.now.strftime("%Y-%m-%d")
+      save_header
+    end
+    def process_page(page, page_hash)
+      write_url(page)
+    end
+    def close_file
+      write_file do |file|
+        file.write "</urlset>\n"
+      end
+    end
+    private
+    def filename_for_output
+      File.join @output_folder, @search_name + "-sitemap.xml"
+    end
+    def save_header
+      write_file do |file|
+        file.write "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+        file.write "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n"
+      end
+    end
+    def write_url(page)
+      write_file do |file|
+      file.write "<url>\n"
+      file.write "  <loc>#{page.url}</loc>\n"
+      file.write "  <lastmod>#{@date}></lastmod>\n"
+      file.write "  <changefreq>monthly</changefreq>\n"
+      file.write "  <priority>#{priority(page.url)}</priority>\n"
+      file.write "</url>\n"
+     end
+    end
+    # Assigns a default priority of 1.0 to 0.1 based on page depth.
+    def priority(url)
+      level = 1.0 - ((url.to_s.split("/").size - 3) / 10.0)
+      level < 0.1 ? 0.1 : level
+    end
+  end
+end

data/lib/basilisk/processors/terms_processor.rb ADDED

@@ -0,0 +1,49 @@
+module Basilisk
+  # Stores page errors.
+  class TermsProcessor < Basilisk::CSVProcessor
+    def initialize(search_name, regex_terms, css_terms)
+      super(search_name)
+      @regex_terms = regex_terms
+      @css_terms   = css_terms
+      save_header_row
+    end
+    def process_page(page, page_hash)
+      regexes   = match_regexes(page.doc)
+      css_terms = match_css_terms(page.doc)
+      write_row(page, regexes, css_terms) if !regexes.empty? || !css_terms.empty?
+    end
+    private
+    def filename_for_output
+      File.join @output_folder, @search_name + "-terms.csv"
+    end
+    def save_header_row
+      write_file do |csv|
+        csv << ["URL", "Regex Terms", "CSS Terms"]
+      end
+    end
+    def write_row(page, regexes, css_terms)
+      write_file do |csv|
+        csv << [page.url, regexes.map {|r| r.source }.join(';'), css_terms.join(';')]
+      end
+    end
+    def match_regexes(doc)
+      @regex_terms.select do |term|
+        doc.to_s =~ term
+      end
+    end
+    def match_css_terms(doc)
+      @css_terms.select do |term|
+        doc.css(term)
+      end
+    end
+  end
+end

data/lib/basilisk/template.rb ADDED

@@ -0,0 +1,54 @@
+module Basilisk
+  module Template
+    extend self
+    def default(options={})
+      yaml = <<-CONFIG
+# This is a basilisk config file.
+# Available processors include the following:
+#   seo: generates a csv with the following columns: url, title, description, keywords, h1s, h2s
+#   sitemap: generates an xml sitemap
+#   image: generates a list of broken images and images lacking an alt tag.
+#   error: generates a csv of urls returning html response codes other than success and redirect.
+#
+# Separate processors with a semi-colon:
+#   processors: "seo; sitemap; error"
+# Separate regex terms with a semi-colon:
+#   regex_search_terms: "error\w+;invalid\w+"
+# Separate css terms with a semi-colon:
+#   css_search_terms: "#error_message; .error"
+# Regex patterns separated with semi-colons
+#   skip_url_patterns: "[0-9]+;some silly expression\s+;"
+# Optionally specify a user agent:
+#   user_agent: "sneaky-crawler"
+basilisk:
+  name: "#{options[:name]}"
+  url: "#{options[:url]}"
+  processors: "seo; sitemap; error"
+  regex_search_terms: ""
+  css_search_terms: ""
+  skip_url_patterns: ""
+  user_agent: "anemone-basilisk"
+      CONFIG
+    end
+    def output_instructions(search_name, filename, foldername)
+      instruction = <<-INSTRUCTIONS
+You just created the following search: #{search_name}
+If you'd like to change the default options, edit the file #{filename}
+To run your search:
+  basil run #{search_name}
+Your search results will appear in #{foldername}
+      INSTRUCTIONS
+      puts instruction
+    end
+  end
+end

data/test/basilisk_test.rb ADDED

File without changes

data/test/test_helper.rb ADDED

File without changes

metadata ADDED

@@ -0,0 +1,100 @@
+--- !ruby/object:Gem::Specification
+name: basilisk
+version: !ruby/object:Gem::Version
+  version: 0.2.5
+platform: ruby
+authors:
+- Kyle Banker
+- Alexander Interactive, Inc.
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-08-24 00:00:00 -04:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: anemone
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.1.2
+    version:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.3.0
+    version:
+- !ruby/object:Gem::Dependency
+  name: fastercsv
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.5.0
+    version:
+description:
+email: knb@alexanderinteractive.com
+executables:
+- basil
+extensions: []
+extra_rdoc_files:
+- README.rdoc
+files:
+- HISTORY
+- LICENSE
+- README.rdoc
+- bin/basil
+- lib/basilisk.rb
+- lib/basilisk/core.rb
+- lib/basilisk/parser.rb
+- lib/basilisk/processor.rb
+- lib/basilisk/template.rb
+- lib/basilisk/processors/error_processor.rb
+- lib/basilisk/processors/seo_processor.rb
+- lib/basilisk/processors/terms_processor.rb
+- lib/basilisk/processors/sitemap_processor.rb
+- lib/basilisk/processors/image_processor.rb
+- test/basilisk_test.rb
+- test/test_helper.rb
+has_rdoc: true
+homepage: http://github.com/aiaio/basilisk
+post_install_message:
+rdoc_options:
+- --main
+- README.rdoc
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project: basilisk
+rubygems_version: 1.3.1
+signing_key:
+specification_version: 2
+summary: A command-line front-end for the anemone web-spider. Generates reports for seo, http errors and an xml sitemap. Extensible page handler.
+test_files:
+- test/basilisk_test.rb
+- test/test_helper.rb