scraptacular 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ZDYyMzNjY2M5ZGRjYmUyNGJjYzkyNGU5OWNmZTlmZTg4NWU3OTkyOQ==
5
+ data.tar.gz: !binary |-
6
+ OTljY2E2NWNmOGYxMmE3NThkZjQxMmJlMGY3NzA4MGMxOTZjOWI5NQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ OGY2ZWE1YjJkNGNkMDRmMWUyYmNlMzk5YWNkYWJjY2U5YjIwNmE0YmUzNjhi
10
+ OTFkM2U0YjcwZGM4MWY3NDhjMTg4MzZjYzQwZmNmZDRjNjI0ZTVlNjM0NmM5
11
+ OTVmNmEwNjM2MGZhZDkxZDg3M2QzMGE5YTU5NjYwZGJmZjhiYTI=
12
+ data.tar.gz: !binary |-
13
+ ZjI0YWFmZTc3Y2VlMWYzMjJmZDhiM2RlZjBhOWMxMmE1MTU3ZWMxZjBlYTVj
14
+ MWY4MzliZWU4Mjc1Y2U1ZmQxMzlmMDBiYjZmZjRmZDI0YjM0ZjIwZjkwMWM4
15
+ MjZhOTk5MmZlNmMzODg4MjI5N2Q2ZjU4ZjU3NjUzNmQ3MzI1ZmE=
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --colour
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in scraptacular.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Roger Vandervort
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,118 @@
1
+ # Scraptacular
2
+
3
+ Organized web-scraping.
4
+ ## Installation
5
+
6
+ Add this line to your application's Gemfile:
7
+
8
+ gem 'scraptacular'
9
+
10
+ And then execute:
11
+
12
+ $ bundle
13
+
14
+ Or install it yourself as:
15
+
16
+ $ gem install scraptacular
17
+
18
+ ## Usage
19
+
20
+ ### Defining Scrapers
21
+ The scraper describes what content should be plucked from the page and returned in the result.
22
+
23
+ *Example 1 : Basic Usage*
24
+ ```ruby
25
+ scraper :yahoo_front_page do
26
+ result do
27
+ highest_trending_url { page.search("ol.trending_now_trend_list li a").first.attributes["href"].value }
28
+ anything { "My returned value" }
29
+ end
30
+ end
31
+ ```
32
+
33
+ *Example 2 : Multiple Level Scraping*
34
+ ```ruby
35
+ scraper :event_index_page do
36
+ # Find URLs, scrape the contents of those pages using the :event_detail_page scraper
37
+ scrape_links("a.css_selector_for_links", with: :event_detail_page).each do |link|
38
+ result do
39
+
40
+ # Provide partial result from the index page
41
+ event_title { page.search("h4").first.text }
42
+
43
+ # Merge results from the detail page
44
+ merge(link)
45
+ end
46
+ end
47
+ end
48
+
49
+ scraper :event_detail_page do
50
+ result do
51
+ date { ... }
52
+ price { ... }
53
+ end
54
+ end
55
+ ```
56
+ Scraping a page returns a Scraptacular::Result object :
57
+ ```ruby
58
+ result = results.first # See section below on running a scraping session
59
+
60
+ result.class # Scraptacular::Result
61
+ result.to_h # {:highest_trending_url => "http://www.harlemshakevideos.com", :anything => "My returned value" }
62
+ ```
63
+
64
+ ### Setting Up Scraping Sessions
65
+
66
+ Scraping sessions are divided into groups and suites. The group is a logical separation by content topic.
67
+ The suite generally refers to a set of urls which should be scraped using the same scraper
68
+
69
+ ```ruby
70
+ scrape_group "Ruby Sites" do
71
+ suite "Google", with: :google_result_index do
72
+
73
+ # The url will be scraped using the :google_result_index scraper
74
+ url "https://www.google.com/search?q=Ruby"
75
+
76
+ # Tell Scraptacular to use a different scraper for an individual URL
77
+ url "https://www.google.com/search?q=Ruby+On+Rails", with: :google_alternate_index
78
+ end
79
+ end
80
+ ```
81
+
82
+ ### Running From The Command Line
83
+ Scraptacular comes with its own command line utility. Currently the only supported output format is JSON:
84
+ See scraptacular --help for more info.
85
+ ```
86
+ $ scraptacular -d /path/to/scraper_definitions.rb -s /path/to/sessions.rb -o /path/to/output.json
87
+ ```
88
+
89
+ ### Inside Your Project
90
+ ```ruby
91
+ require 'scraptacular'
92
+
93
+ # Set up the definitions and sessions
94
+ scraper :my_scraper do
95
+ end
96
+
97
+ scrape_group "My Group" do
98
+ suite "My Suite", with: :my_scraper do
99
+ url "http..."
100
+ url "http..."
101
+ end
102
+ end
103
+
104
+ # Run all groups
105
+ results = Scraptacular.world.run
106
+
107
+ # Run a single group
108
+ results = Scraptacular.world.run({group: "My Group"})
109
+
110
+ ```
111
+
112
+ ## Contributing
113
+
114
+ 1. Fork it
115
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
116
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
117
+ 4. Push to the branch (`git push origin my-new-feature`)
118
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'scraptacular'
4
+
5
+ # --
6
+ Scraptacular::CommandLine.new
@@ -0,0 +1,36 @@
1
+ require 'mechanize'
2
+
3
+ class String
4
+ def clear
5
+ self.tr_s("\n\r\t"," ")
6
+ end
7
+ end
8
+
9
+ module Scraptacular
10
+ class << self
11
+ def [](scraper_identifier)
12
+ world.scrapers[scraper_identifier]
13
+ end
14
+
15
+ def agent
16
+ @agent ||= Mechanize.new
17
+ end
18
+
19
+ def define_scraper(identifier, &block)
20
+ Scraptacular.world.scrapers[identifier.to_sym] = Scraptacular::Scraper.new(identifier, &block)
21
+ end
22
+
23
+ def world
24
+ @world ||= Scraptacular::World.new
25
+ end
26
+ end
27
+ end
28
+
29
+ require 'scraptacular/dsl'
30
+ require 'scraptacular/world'
31
+ require 'scraptacular/group'
32
+ require 'scraptacular/suite'
33
+ require 'scraptacular/url'
34
+ require 'scraptacular/scraper'
35
+ require 'scraptacular/command_line'
36
+ require 'scraptacular/result'
@@ -0,0 +1,100 @@
1
+ # http://www.ruby-doc.org/stdlib/libdoc/optparse/rdoc/classes/OptionParser.html
2
+ require 'optparse'
3
+ require 'json'
4
+
5
+ module Scraptacular
6
+ class CommandLine
7
+ def initialize
8
+ parse_options
9
+
10
+ at_exit do
11
+ next unless $!.nil? || $!.kind_of?(SystemExit)
12
+
13
+ status = run(options).to_i
14
+ exit status if status != 0
15
+ end
16
+ end
17
+
18
+ def load_file(file_name)
19
+ end
20
+
21
+ def options
22
+ @options ||= {}
23
+ end
24
+
25
+ def parse_options
26
+ @options = {}
27
+
28
+ OptionParser.new do |parser|
29
+ parser.banner = "Usage: scraptacular -d DEFINITION_FILE -s SESSION_FILE"
30
+
31
+ parser.on('-d', '--definition-file DEFINITION_FILE', 'Specify a file container scraper definitions') do |file|
32
+ @options[:definition_file] = file
33
+ end
34
+ parser.on('-s','--session-file SESSION_FILE','Specify groups, suites, and URLs to be scraped') do |file|
35
+ @options[:session_file] = file
36
+ end
37
+ parser.on('-o','--output-file [OUTPUT_FILE]', 'Scrape result output file. Only useful for text output') do |file|
38
+ @options[:output_file] = file
39
+ end
40
+
41
+ parser.on('-g GROUP', '--group', 'Specify a single group to scrape') do |group|
42
+ @options[:only_group] = group
43
+ end
44
+
45
+ parser.on('-f', '--format [FORMAT]', 'only "json" is supported') do |format|
46
+ @options[:format] = "json"
47
+ end
48
+
49
+ parser.on_tail('-h','--help','The help file') do
50
+ puts parser
51
+ exit
52
+ end
53
+ end.parse!
54
+ end
55
+
56
+ def run(options, out = $stdout)
57
+ return 1 unless (validate_file(options[:definition_file], "definition file", out) &&
58
+ validate_file(options[:session_file], "session file", out) )
59
+
60
+ res = Scraptacular.world.run(options)
61
+
62
+ #TODO: Replace with "Formatters" e.g. JSONFormatter, XMLFormatter, whatever
63
+ if options[:format] == "json"
64
+ res.each do |group, suites|
65
+ suites.each do |suite, results|
66
+ suites[suite] = results.map!(&:to_h)
67
+ end
68
+ end
69
+
70
+ res = res.to_json
71
+ end
72
+
73
+ # TODO : Replace with "Outputters".
74
+ if options[:output_file]
75
+ File.open(options[:output_file], 'w') { |file| file.write(res) }
76
+ else
77
+ out.puts res
78
+ end
79
+
80
+ return 0
81
+ end
82
+
83
+ def validate_file(file_name, caption, out)
84
+ if file_name.nil?
85
+ out.puts "You must specify a #{caption}. See --help for more information"
86
+ return false
87
+ else
88
+ if File.exists?(file_name)
89
+ load file_name
90
+ return true
91
+ else
92
+ out.puts "The #{caption} specified does not exist: #{file_name}"
93
+ return false
94
+ end
95
+
96
+ end
97
+ end
98
+ end
99
+ end
100
+
@@ -0,0 +1,20 @@
1
+ module Scraptacular
2
+ module Core
3
+ module DSL
4
+
5
+ # Define a new grouping
6
+ def scrape_group(name, &group_block)
7
+ Scraptacular.world.register_group name, &group_block
8
+ end
9
+
10
+ # Define a new scraper
11
+ def scraper(name, &scraper_block)
12
+ Scraptacular.world.register_scraper name, &scraper_block
13
+ end
14
+
15
+ end
16
+ end
17
+ end
18
+
19
+ extend Scraptacular::Core::DSL
20
+ Module.send :include, Scraptacular::Core::DSL
@@ -0,0 +1,30 @@
1
+ module Scraptacular
2
+ class Group
3
+ attr_accessor :name, :suites
4
+
5
+ def initialize(name, &block)
6
+ @name = name
7
+ @suites = []
8
+
9
+ instance_eval(&block)
10
+ end
11
+
12
+ def run(out)
13
+ out.puts "Group: #{self.name}"
14
+
15
+ results = {}
16
+
17
+ suites.each do |suite|
18
+ results[suite.name] = suite.run(out)
19
+ end
20
+
21
+ results
22
+ end
23
+
24
+ def suite(name, options = {}, &block)
25
+ suite = Scraptacular::Suite.new(name, options, &block)
26
+ @suites << suite
27
+ suite
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,31 @@
1
+ module Scraptacular
2
+ class Result
3
+ attr_reader :page
4
+
5
+ def initialize(page)
6
+ @page = page
7
+ @result = {}
8
+ end
9
+
10
+ def merge(other_result, priority = :other)
11
+ if priority == :other
12
+ @result.merge! other_result.to_h
13
+ else
14
+ @result = other_result.to_h.merge @result
15
+ end
16
+ end
17
+
18
+ def method_missing(method_name, *args, &block)
19
+ # field_name { page.search(lskdjfskdf) }
20
+ @result[method_name] = instance_eval(&block)
21
+ end
22
+
23
+ def to_h
24
+ @result
25
+ end
26
+
27
+ def to_json
28
+ @result.to_json
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,48 @@
1
+ module Scraptacular
2
+
3
+
4
+ class Scraper
5
+ attr_reader :name, :page
6
+
7
+ def initialize(name, &block)
8
+ @name = name
9
+ @block = block
10
+ end
11
+
12
+ def result(&block)
13
+ retval = Scraptacular::Result.new(@page)
14
+ retval.instance_eval(&block)
15
+
16
+ retval.send :remove_instance_variable, :@page
17
+ @results << retval
18
+ end
19
+
20
+ def run(page)
21
+ @page = page
22
+ @results = []
23
+ instance_eval &@block
24
+
25
+
26
+ @results
27
+ end
28
+
29
+ def scrape_links(selector, options = {})
30
+ if options[:with]
31
+ unless scraper = Scraptacular.world.scrapers[options[:with]]
32
+ raise ArgumentError, "scraper #{options[:with]} does not exist"
33
+ end
34
+ else
35
+ raise ArgumentError, "You must supply a scraper using the :with option"
36
+ end
37
+
38
+ retval = []
39
+
40
+ page.search(selector).each do |link|
41
+ subpage = Scraptacular.agent.get(link.attributes["href"].value)
42
+ retval += [*scraper.run(subpage)]
43
+ end
44
+
45
+ retval
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,44 @@
1
+ module Scraptacular
2
+ class Suite
3
+ attr_accessor :default_scraper, :urls, :name
4
+
5
+ def initialize(name, options = {}, &block)
6
+ @name = name
7
+
8
+ if !options.has_key?(:with)
9
+ raise ArgumentError, "You must supply a default scraper using :with"
10
+ end
11
+
12
+ @default_scraper = Scraptacular.world.scrapers[options[:with]]
13
+ if @default_scraper.nil?
14
+ raise ArgumentError, "The supplied scraper :#{options[:with]} does not exist"
15
+ end
16
+
17
+ @urls = []
18
+
19
+ instance_eval(&block)
20
+ end
21
+
22
+ def run(out)
23
+ out.puts " Suite: #{self.name}"
24
+
25
+ results = []
26
+
27
+ urls.each do |url|
28
+ out.puts " URL: #{url.path}"
29
+
30
+ scraper = url.scraper
31
+ scraper ||= default_scraper
32
+
33
+ page = Scraptacular.agent.get(url.path)
34
+ results += [*scraper.run(page)]
35
+ end
36
+
37
+ results
38
+ end
39
+
40
+ def url(url_path, options = {})
41
+ urls << Scraptacular::URL.new(url_path, options)
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,17 @@
1
+ module Scraptacular
2
+ class URL
3
+ attr_reader :path, :scraper
4
+
5
+ def initialize(path, options = {})
6
+ @path = path
7
+
8
+ if options[:with]
9
+ unless @scraper = Scraptacular.world.scrapers[options[:with]]
10
+ raise ArgumentError, "The supplied scraper :#{options[:with]} does not exist"
11
+ end
12
+
13
+ end
14
+
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,3 @@
1
+ module Scraptacular
2
+ VERSION = "0.0.2"
3
+ end
@@ -0,0 +1,42 @@
1
+ module Scraptacular
2
+ class World
3
+ attr_accessor :groups, :scrapers
4
+
5
+ def initialize
6
+ @groups = []
7
+ @scrapers = {}
8
+ end
9
+
10
+ def register_group(name, &block)
11
+ group = Scraptacular::Group.new(name, &block)
12
+ @groups << group
13
+ group
14
+ end
15
+
16
+ def register_scraper(identifier, &block)
17
+ scraper = Scraptacular::Scraper.new(identifier, &block)
18
+ @scrapers[identifier] = scraper
19
+ scraper
20
+ end
21
+
22
+ def reset
23
+ @results = {}
24
+ end
25
+
26
+ def run(options, out = $stdout)
27
+ reset
28
+
29
+ if options[:group]
30
+ groups_to_run = @groups.select { |g| g.name == options[:group] }
31
+ else
32
+ groups_to_run = @groups
33
+ end
34
+
35
+ groups_to_run.each do |group|
36
+ @results[group.name] = group.run(out)
37
+ end
38
+
39
+ @results
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'scraptacular/version'
5
+
6
+ Gem::Specification.new do |gem|
7
+ gem.name = "scraptacular"
8
+ gem.version = Scraptacular::VERSION
9
+ gem.authors = ["Roger Vandervort"]
10
+ gem.email = ["rvandervort@gmail.com"]
11
+ gem.description = %q{Organized web scraping}
12
+ gem.summary = %q{Organized web scraping}
13
+ gem.homepage = "https://github.com/rvandervort/scraptacular"
14
+
15
+ gem.files = `git ls-files`.split($/)
16
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
+ gem.require_paths = ["lib"]
19
+
20
+ gem.add_runtime_dependency 'mechanize', '~> 2.5'
21
+ gem.add_development_dependency 'simplecov', '~> 0.7.1'
22
+ end
@@ -0,0 +1,40 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe Scraptacular::CommandLine do
4
+ describe ".run" do
5
+ let(:options) { {definition_file: 'def_file', session_file: 'sess_file' } }
6
+ let(:cl) { described_class.new }
7
+
8
+ before :each do
9
+ described_class.any_instance.stub(:parse_options)
10
+ cl.instance_variable_set :@options, options
11
+ end
12
+
13
+ context "with valid inputs" do
14
+ before :each do
15
+ cl.stub(:validate_file).with('def_file', 'definition file', $stdout).and_return(true)
16
+ cl.stub(:validate_file).with('sess_file', 'session file', $stdout).and_return(true)
17
+ end
18
+
19
+ it "tells the Scraptacular.world to run" do
20
+ Scraptacular.world.should_receive(:run).with(options)
21
+ cl.run(options)
22
+ end
23
+ end
24
+
25
+ context "with invalid inputs" do
26
+ before :each do
27
+ cl.stub(:validate_file).and_return(false)
28
+ end
29
+
30
+ it "tells the Scraptacular.world to run" do
31
+ Scraptacular.world.should_not_receive(:run)
32
+ cl.run(options)
33
+ end
34
+
35
+ end
36
+
37
+
38
+ end
39
+ end
40
+
@@ -0,0 +1,41 @@
1
+ require 'spec_helper.rb'
2
+
3
+ main = self
4
+
5
+ methods = [
6
+ :scrape_group,
7
+ :scraper
8
+ ]
9
+
10
+ methods.each do |method_name|
11
+ describe "##{method_name}" do
12
+ it "is not added to every object" do
13
+ expect(main).to respond_to(method_name)
14
+ expect(Module.new).to respond_to(method_name)
15
+ expect(Object.new).not_to respond_to(method_name)
16
+ end
17
+ end
18
+ end
19
+
20
+ describe "#scrape_group" do
21
+ let(:world) { Scraptacular::World.new}
22
+ let(:name) { "Test Group" }
23
+ let(:test_block) { Proc.new { } }
24
+
25
+ before :each do
26
+ Scraptacular.stub(:world).and_return(world)
27
+ end
28
+
29
+ it "registers a new ScrapeGroup object" do
30
+ world.should_receive(:register_group).with(name, &test_block)
31
+ Module.new.scrape_group(name, &test_block)
32
+ end
33
+
34
+ it "returns the new group" do
35
+ Module.new.scrape_group(name, &test_block).should be_instance_of(Scraptacular::Group)
36
+ end
37
+
38
+ end
39
+
40
+ describe "#scraper" do
41
+ end
@@ -0,0 +1,62 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe Scraptacular::Group do
4
+ let(:name) { "Test Group" }
5
+ let(:block) { Proc.new { @test_instance_var = 42 } }
6
+ let(:group) { described_class.new(name, &block) }
7
+
8
+ before :all do
9
+ Scraptacular.world.register_scraper :test_scraper, &Proc.new {}
10
+ end
11
+
12
+ describe "initialize" do
13
+ it "sets the group name" do
14
+ group.name.should == name
15
+ end
16
+
17
+ it "creates an array of suites" do
18
+ group.suites.should be_instance_of(Array)
19
+ end
20
+
21
+ it "executes the block to register the suites" do
22
+ group.instance_variable_get(:@test_instance_var).should == 42
23
+ end
24
+ end
25
+
26
+ describe ".run" do
27
+ let(:suite) { Scraptacular::Suite.new "Test Suite", {with: :test_scraper}, &Proc.new {}}
28
+
29
+ before :each do
30
+ suite.stub(:run).and_return([1,2,3])
31
+ group.suites << suite
32
+ end
33
+
34
+ it "runs each suite" do
35
+ suite.should_receive(:run)
36
+ group.run($stdout)
37
+ end
38
+
39
+ it "returns a hash of results for the suites" do
40
+ group.run($stdout).should == {"Test Suite" => [1,2,3]}
41
+ end
42
+ end
43
+
44
+ describe ".suite" do
45
+ let(:suite_name) { "Test Suite" }
46
+ let(:suite_options) {{with: :default_scraper } }
47
+ let(:suite_block) { Proc.new {} }
48
+
49
+ before :each do
50
+ Scraptacular.world.scrapers.stub(:[]).and_return(true)
51
+ end
52
+
53
+ it "creates a new Scraptacular::Suite" do
54
+ group.suites.should_receive(:<<).with(an_instance_of(Scraptacular::Suite))
55
+ group.suite(suite_name, suite_options, &suite_block)
56
+ end
57
+
58
+ it "returns the new Suite object" do
59
+ group.suite(suite_name, suite_options, &suite_block).should be_instance_of(Scraptacular::Suite)
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,48 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe Scraptacular::Result do
4
+ let(:page) { stub('Page') }
5
+ let(:result) { described_class.new(page) }
6
+
7
+ describe "initialize" do
8
+ it "sets the internal page reference" do
9
+ result.instance_variable_get(:@page).should == page
10
+ end
11
+
12
+ it "sets the result to an empty hash" do
13
+ result.instance_variable_get(:@result).should == {}
14
+ end
15
+ end
16
+
17
+ describe ".merge(other_result, priority = :other)" do
18
+ let(:other_result) { described_class.new(page) }
19
+
20
+ before :each do
21
+ result.instance_variable_set :@result, {url: "url1", attribute: "test"}
22
+ other_result.instance_variable_set :@result, {url: "url2", attribute2: "test"}
23
+ end
24
+
25
+ it "adds values from the other_result to the current result hash" do
26
+ result.merge other_result, :other
27
+ result.to_h.should == {url: "url2", attribute: "test", attribute2: "test"}
28
+ end
29
+
30
+ context "priority is self" do
31
+ it "does not overwrite values in the current result wiht those from the other" do
32
+ result.merge other_result, :self
33
+ result.to_h.should == {url: "url1", attribute: "test", attribute2: "test"}
34
+ end
35
+ end
36
+ end
37
+
38
+ describe ".method_missing" do
39
+ it "evaluates the method name as a new key in the result hash" do
40
+ result.new_field { "test value" }
41
+ result.to_h.has_key?(:new_field).should be_true
42
+ result.to_h[:new_field].should == "test value"
43
+ end
44
+ end
45
+
46
+ describe ".to_h" do
47
+ end
48
+ end
@@ -0,0 +1,84 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe Scraptacular::Scraper do
4
+ let(:name) { :scraper_name }
5
+ let(:block) { Proc.new { @test_val = "Hello" } }
6
+ let(:scraper) { described_class.new(name, &block) }
7
+
8
+
9
+ describe "initialize" do
10
+ it "sets the internal identifier" do
11
+ scraper.name.should == name
12
+ end
13
+ it "sets the internal block" do
14
+ scraper.instance_variable_get(:@block).should == block
15
+ end
16
+ end
17
+
18
+ describe ".scrape_links(selector, options = {})" do
19
+
20
+ let(:sub_result1) { stub('Node', attributes: {"href" => stub('Attribute', value: "url1") }) }
21
+ let(:sub_result2) { stub('Node', attributes: {"href" => stub('Attribute', value: "url2") }) }
22
+
23
+ let(:page) { stub('Mechanize::Page') }
24
+ let(:sub_page1) { stub('Mechanize::Page') }
25
+ let(:sub_page2) { stub('Mechanize::Page') }
26
+
27
+ let(:sub_scraper) do
28
+ Scraptacular::define_scraper :exists do
29
+ result do
30
+ end
31
+ end
32
+ end
33
+ before :each do
34
+ sub_scraper
35
+ scraper.instance_variable_set :@page, page
36
+ page.stub(:search).and_return([sub_result1, sub_result2])
37
+ Mechanize.any_instance.stub(:get).with("url1").and_return(sub_page1)
38
+ Mechanize.any_instance.stub(:get).with("url2").and_return(sub_page2)
39
+ end
40
+
41
+ it "raises an ArgumentError if no :with parameter was supplied" do
42
+ expect { scraper.scrape_links("a.test_links", {})}.to raise_error(ArgumentError)
43
+ end
44
+ it "raises an ArgumentError if the :with scraper is not registered" do
45
+ expect { scraper.scrape_links("a.test_links", {with: :does_not_exist})}.to raise_error(ArgumentError)
46
+ end
47
+ it "does not raise an ArgumentError if the :with scraper is registered" do
48
+ expect { scraper.scrape_links("a.test_links", {with: :exists})}.not_to raise_error(ArgumentError)
49
+ end
50
+ it "searches the page for links matching the selector" do
51
+ page.should_receive(:search).with("a.test_links")
52
+ scraper.scrape_links "a.test_links", {with: :exists}
53
+ end
54
+ it "retrieves the page contents for each found URL" do
55
+ Mechanize.any_instance.should_receive(:get).with("url1").exactly(1).times
56
+ Mechanize.any_instance.should_receive(:get).with("url2").exactly(1).times
57
+
58
+ scraper.scrape_links "a.test_links", {with: :exists}
59
+ end
60
+
61
+ it "scrapes the sub-pages using the supplied :with scraper" do
62
+ sub_scraper.should_receive(:run).with(sub_page1).exactly(1).times
63
+ sub_scraper.should_receive(:run).with(sub_page2).exactly(1).times
64
+ scraper.scrape_links "a.test_links", {with: :exists}
65
+ end
66
+
67
+ it "returns an array of results" do
68
+ scraper.scrape_links("a.test_links", {with: :exists}).should be_instance_of(Array)
69
+ end
70
+ end
71
+
72
+ describe ".run" do
73
+ let(:page) { stub 'Mechanize::Page' }
74
+ it "evalutes the block" do
75
+ expect { scraper.run page }.to change { scraper.instance_variable_get(:@test_val) }.to("Hello")
76
+ end
77
+ it "returns an array of results" do
78
+ scraper.run(page).should be_instance_of(Array)
79
+ end
80
+ end
81
+
82
+ describe ".result" do
83
+ end
84
+ end
@@ -0,0 +1,94 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe Scraptacular::Suite do
4
+ let(:name) { "Test Suite" }
5
+ let(:block) { Proc.new { @test_var = "HI!" } }
6
+ let(:scraper) { stub("Scraptacular::Scraper") }
7
+ let(:suite_options) {{with: :some_identifier }}
8
+
9
+ let(:suite) { Scraptacular::Suite.new(name, suite_options, &block) }
10
+
11
+ before :each do
12
+ Scraptacular.world.scrapers.stub(:[]).with(:some_identifier).and_return(scraper)
13
+ scraper.stub(:run)
14
+ end
15
+
16
+ describe "#initialize" do
17
+ context ":with argument" do
18
+ it "sets the default scraper, if supplied" do
19
+ suite.default_scraper.should == scraper
20
+ end
21
+
22
+ it "raises an ArgumentError if the with parameter is not supplied" do
23
+ suite_options.delete(:with)
24
+ expect { suite }.to raise_error(ArgumentError)
25
+ end
26
+
27
+ it "raises an ArgumentError if the selected scraper is not defined" do
28
+ Scraptacular.world.scrapers.stub(:[]).and_return(nil)
29
+ expect { suite }.to raise_error(ArgumentError)
30
+ end
31
+ end
32
+
33
+ it "initializes an @urls attribute as an Array" do
34
+ suite.instance_variable_get(:@urls).should be_instance_of(Array)
35
+ end
36
+
37
+ it "executes the passed block" do
38
+ suite.instance_variable_get(:@test_var).should == "HI!"
39
+ end
40
+ end
41
+
42
+ describe ".run" do
43
+ let(:url1) { Scraptacular::URL.new("test_url_1") }
44
+ let(:url2) { Scraptacular::URL.new("test_url_2") }
45
+ let(:page1) { stub('Page 1') }
46
+ let(:page2) { stub('Page 2') }
47
+
48
+ let(:fake_result) { stub('Scraptacular::Result')}
49
+
50
+
51
+ before :each do
52
+ suite.urls << url1
53
+ suite.urls << url2
54
+
55
+ Mechanize.any_instance.stub(:get).with(url1.path).and_return(page1)
56
+ Mechanize.any_instance.stub(:get).with(url2.path).and_return(page2)
57
+
58
+ scraper.stub(:run).and_return(fake_result)
59
+ end
60
+
61
+ it "iterates through all of the URLs" do
62
+ Mechanize.any_instance.should_receive(:get).with(url1.path)
63
+ Mechanize.any_instance.should_receive(:get).with(url2.path)
64
+
65
+ suite.run($stdout)
66
+ end
67
+
68
+ it "uses the URL's scraper, if defined" do
69
+ sc = Scraptacular.world.register_scraper(:url2_scraper, &Proc.new {})
70
+ url2.instance_variable_set :@scraper, sc
71
+
72
+ sc.should_receive(:run).with(page2)
73
+ suite.run($stdout)
74
+ end
75
+
76
+ it "uses the default scraper if a url-specified scraper is not defined" do
77
+ scraper.should_receive(:run).with(page1).exactly(1).times
78
+ scraper.should_receive(:run).with(page2).exactly(1).times
79
+
80
+ suite.run($stdout)
81
+ end
82
+
83
+ it "returns an Array of results" do
84
+ suite.run($stdout).should == [fake_result, fake_result]
85
+ end
86
+ end
87
+
88
+ describe ".url(path, options = {})" do
89
+ it "appends a new Scraptacular::URL object to the @urls list" do
90
+ suite.urls.should_receive(:<<).with(an_instance_of(Scraptacular::URL))
91
+ suite.url("dummy path", {})
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,22 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe Scraptacular::URL do
4
+ let(:path) { "http://www.google.com" }
5
+
6
+ describe "#initialize" do
7
+ it "raises an argument error if the :with option does not reference a valid scraper" do
8
+ expect { Scraptacular::URL.new(path,{with: :does_not_exist}) }.to raise_error(ArgumentError)
9
+ end
10
+ it "does not raise an argument error if the :with options references a valid scraper" do
11
+ Scraptacular.world.scrapers[:exists] = "nothing interesting"
12
+
13
+ expect { Scraptacular::URL.new(path,{with: :exists}) }.not_to raise_error(ArgumentError)
14
+ end
15
+
16
+ it "sets the internal @path attribute" do
17
+ described_class.new(path).path.should == path
18
+ end
19
+ end
20
+
21
+
22
+ end
@@ -0,0 +1,58 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe Scraptacular::World do
4
+ let(:world) { described_class.new }
5
+ let(:name) { "Name "}
6
+ let(:block) { Proc.new {} }
7
+
8
+ describe ".register_group" do
9
+ it "Creates a new group object" do
10
+ Scraptacular::Group.should_receive(:new)
11
+ world.register_group(name, &block)
12
+ end
13
+
14
+ it "adds the group object to the world's known group list" do
15
+ expect { world.register_group(name, &block) }.to change { world.groups.count }.to(1)
16
+ end
17
+ end
18
+
19
+ describe ".register_scraper" do
20
+ it "creates a new scraper object" do
21
+ Scraptacular::Scraper.should_receive(:new)
22
+ world.register_scraper(name, &block)
23
+ end
24
+
25
+ it "adds the scraper object the world's known scraper list" do
26
+ expect { world.register_scraper(name, &block) }.to change{world.scrapers.count }.to(1)
27
+ end
28
+ end
29
+
30
+ describe ".run(options)" do
31
+ let(:group1) { Scraptacular::Group.new "Group 1", &Proc.new {}}
32
+ let(:group2) { Scraptacular::Group.new "Group 2", &Proc.new {}}
33
+
34
+ before :all do
35
+ world.groups << group1
36
+ world.groups << group2
37
+ end
38
+
39
+ it "runs all groups if :group is not specified" do
40
+ group1.should_receive(:run)
41
+ group2.should_receive(:run)
42
+
43
+ world.run({})
44
+ end
45
+
46
+ it "runs only a single group if specified, and exists" do
47
+ group1.should_receive(:run)
48
+ group2.should_not_receive(:run)
49
+
50
+ world.run({group: "Group 1"})
51
+ end
52
+
53
+ it "returns a hash of results" do
54
+ p world.run({})
55
+
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,25 @@
1
+ require 'simplecov'
2
+ SimpleCov.start
3
+
4
+ require 'scraptacular'
5
+
6
+ RSpec.configure do |config|
7
+
8
+ # Redirect the programs output elsewhere
9
+ # See : https://gist.github.com/adamstegman/926858
10
+ config.before :all do
11
+ @orig_stderr = $stderr
12
+ @orig_stdout = $stdout
13
+
14
+ # redirect stderr and stdout to /dev/null
15
+ $stderr = File.new('/dev/null', 'w')
16
+ $stdout = File.new('/dev/null', 'w')
17
+ end
18
+
19
+ config.after :all do
20
+ $stderr = @orig_stderr
21
+ $stdout = @orig_stdout
22
+ @orig_stderr = nil
23
+ @orig_stdout = nil
24
+ end
25
+ end
metadata ADDED
@@ -0,0 +1,108 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scraptacular
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Roger Vandervort
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-03-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mechanize
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '2.5'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '2.5'
27
+ - !ruby/object:Gem::Dependency
28
+ name: simplecov
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 0.7.1
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: 0.7.1
41
+ description: Organized web scraping
42
+ email:
43
+ - rvandervort@gmail.com
44
+ executables:
45
+ - scraptacular
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - .gitignore
50
+ - .rspec
51
+ - Gemfile
52
+ - LICENSE.txt
53
+ - README.md
54
+ - Rakefile
55
+ - bin/scraptacular
56
+ - lib/scraptacular.rb
57
+ - lib/scraptacular/command_line.rb
58
+ - lib/scraptacular/dsl.rb
59
+ - lib/scraptacular/group.rb
60
+ - lib/scraptacular/result.rb
61
+ - lib/scraptacular/scraper.rb
62
+ - lib/scraptacular/suite.rb
63
+ - lib/scraptacular/url.rb
64
+ - lib/scraptacular/version.rb
65
+ - lib/scraptacular/world.rb
66
+ - scraptacular.gemspec
67
+ - spec/scraptacular/command_line_spec.rb
68
+ - spec/scraptacular/dsl_spec.rb
69
+ - spec/scraptacular/group_spec.rb
70
+ - spec/scraptacular/result_spec.rb
71
+ - spec/scraptacular/scraper_spec.rb
72
+ - spec/scraptacular/suite_spec.rb
73
+ - spec/scraptacular/url_spec.rb
74
+ - spec/scraptacular/world_spec.rb
75
+ - spec/spec_helper.rb
76
+ homepage: https://github.com/rvandervort/scraptacular
77
+ licenses: []
78
+ metadata: {}
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ! '>='
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements: []
94
+ rubyforge_project:
95
+ rubygems_version: 2.0.3
96
+ signing_key:
97
+ specification_version: 4
98
+ summary: Organized web scraping
99
+ test_files:
100
+ - spec/scraptacular/command_line_spec.rb
101
+ - spec/scraptacular/dsl_spec.rb
102
+ - spec/scraptacular/group_spec.rb
103
+ - spec/scraptacular/result_spec.rb
104
+ - spec/scraptacular/scraper_spec.rb
105
+ - spec/scraptacular/suite_spec.rb
106
+ - spec/scraptacular/url_spec.rb
107
+ - spec/scraptacular/world_spec.rb
108
+ - spec/spec_helper.rb