scraptacular 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ZDYyMzNjY2M5ZGRjYmUyNGJjYzkyNGU5OWNmZTlmZTg4NWU3OTkyOQ==
5
+ data.tar.gz: !binary |-
6
+ OTljY2E2NWNmOGYxMmE3NThkZjQxMmJlMGY3NzA4MGMxOTZjOWI5NQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ OGY2ZWE1YjJkNGNkMDRmMWUyYmNlMzk5YWNkYWJjY2U5YjIwNmE0YmUzNjhi
10
+ OTFkM2U0YjcwZGM4MWY3NDhjMTg4MzZjYzQwZmNmZDRjNjI0ZTVlNjM0NmM5
11
+ OTVmNmEwNjM2MGZhZDkxZDg3M2QzMGE5YTU5NjYwZGJmZjhiYTI=
12
+ data.tar.gz: !binary |-
13
+ ZjI0YWFmZTc3Y2VlMWYzMjJmZDhiM2RlZjBhOWMxMmE1MTU3ZWMxZjBlYTVj
14
+ MWY4MzliZWU4Mjc1Y2U1ZmQxMzlmMDBiYjZmZjRmZDI0YjM0ZjIwZjkwMWM4
15
+ MjZhOTk5MmZlNmMzODg4MjI5N2Q2ZjU4ZjU3NjUzNmQ3MzI1ZmE=
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --colour
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in scraptacular.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Roger Vandervort
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,118 @@
1
+ # Scraptacular
2
+
3
+ Organized web-scraping.
4
+ ## Installation
5
+
6
+ Add this line to your application's Gemfile:
7
+
8
+ gem 'scraptacular'
9
+
10
+ And then execute:
11
+
12
+ $ bundle
13
+
14
+ Or install it yourself as:
15
+
16
+ $ gem install scraptacular
17
+
18
+ ## Usage
19
+
20
+ ### Defining Scrapers
21
+ The scraper describes what content should be plucked from the page and returned in the result.
22
+
23
+ *Example 1 : Basic Usage*
24
+ ```ruby
25
+ scraper :yahoo_front_page do
26
+ result do
27
+ highest_trending_url { page.search("ol.trending_now_trend_list li a").first.attributes["href"].value }
28
+ anything { "My returned value" }
29
+ end
30
+ end
31
+ ```
32
+
33
+ *Example 2 : Multiple Level Scraping*
34
+ ```ruby
35
+ scraper :event_index_page do
36
+ # Find URLs, scrape the contents of those pages using the :event_detail_page scraper
37
+ scrape_links("a.css_selector_for_links", with: :event_detail_page).each do |link|
38
+ result do
39
+
40
+ # Provide partial result from the index page
41
+ event_title { page.search("h4").first.text }
42
+
43
+ # Merge results from the detail page
44
+ merge(link)
45
+ end
46
+ end
47
+ end
48
+
49
+ scraper :event_detail_page do
50
+ result do
51
+ date { ... }
52
+ price { ... }
53
+ end
54
+ end
55
+ ```
56
+ Scraping a page returns a Scraptacular::Result object :
57
+ ```ruby
58
+ result = results.first # See section below on running a scraping session
59
+
60
+ result.class # Scraptacular::Result
61
+ result.to_h # {:highest_trending_url => "http://www.harlemshakevideos.com", :anything => "My returned value" }
62
+ ```
63
+
64
+ ### Setting Up Scraping Sessions
65
+
66
+ Scraping sessions are divided into groups and suites. The group is a logical separation by content topic.
67
+ The suite generally refers to a set of urls which should be scraped using the same scraper
68
+
69
+ ```ruby
70
+ scrape_group "Ruby Sites" do
71
+ suite "Google", with: :google_result_index do
72
+
73
+ # The url will be scraped using the :google_result_index scraper
74
+ url "https://www.google.com/search?q=Ruby"
75
+
76
+ # Tell Scraptacular to use a different scraper for an individual URL
77
+ url "https://www.google.com/search?q=Ruby+On+Rails", with: :google_alternate_index
78
+ end
79
+ end
80
+ ```
81
+
82
+ ### Running From The Command Line
83
+ Scraptacular comes with its own command line utility. Currently the only supported output format is JSON:
84
+ See scraptacular --help for more info.
85
+ ```
86
+ $ scraptacular -d /path/to/scraper_definitions.rb -s /path/to/sessions.rb -o /path/to/output.json
87
+ ```
88
+
89
+ ### Inside Your Project
90
+ ```ruby
91
+ require 'scraptacular'
92
+
93
+ # Set up the definitions and sessions
94
+ scraper :my_scraper do
95
+ end
96
+
97
+ scrape_group "My Group" do
98
+ suite "My Suite", with: :my_scraper do
99
+ url "http..."
100
+ url "http..."
101
+ end
102
+ end
103
+
104
+ # Run all groups
105
+ results = Scraptacular.world.run
106
+
107
+ # Run a single group
108
+ results = Scraptacular.world.run({group: "My Group"})
109
+
110
+ ```
111
+
112
+ ## Contributing
113
+
114
+ 1. Fork it
115
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
116
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
117
+ 4. Push to the branch (`git push origin my-new-feature`)
118
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'scraptacular'
4
+
5
+ # --
6
+ Scraptacular::CommandLine.new
@@ -0,0 +1,36 @@
1
+ require 'mechanize'
2
+
3
+ class String
4
+ def clear
5
+ self.tr_s("\n\r\t"," ")
6
+ end
7
+ end
8
+
9
+ module Scraptacular
10
+ class << self
11
+ def [](scraper_identifier)
12
+ world.scrapers[scraper_identifier]
13
+ end
14
+
15
+ def agent
16
+ @agent ||= Mechanize.new
17
+ end
18
+
19
+ def define_scraper(identifier, &block)
20
+ Scraptacular.world.scrapers[identifier.to_sym] = Scraptacular::Scraper.new(identifier, &block)
21
+ end
22
+
23
+ def world
24
+ @world ||= Scraptacular::World.new
25
+ end
26
+ end
27
+ end
28
+
29
+ require 'scraptacular/dsl'
30
+ require 'scraptacular/world'
31
+ require 'scraptacular/group'
32
+ require 'scraptacular/suite'
33
+ require 'scraptacular/url'
34
+ require 'scraptacular/scraper'
35
+ require 'scraptacular/command_line'
36
+ require 'scraptacular/result'
@@ -0,0 +1,100 @@
1
+ # http://www.ruby-doc.org/stdlib/libdoc/optparse/rdoc/classes/OptionParser.html
2
+ require 'optparse'
3
+ require 'json'
4
+
5
+ module Scraptacular
6
+ class CommandLine
7
+ def initialize
8
+ parse_options
9
+
10
+ at_exit do
11
+ next unless $!.nil? || $!.kind_of?(SystemExit)
12
+
13
+ status = run(options).to_i
14
+ exit status if status != 0
15
+ end
16
+ end
17
+
18
+ def load_file(file_name)
19
+ end
20
+
21
+ def options
22
+ @options ||= {}
23
+ end
24
+
25
+ def parse_options
26
+ @options = {}
27
+
28
+ OptionParser.new do |parser|
29
+ parser.banner = "Usage: scraptacular -d DEFINITION_FILE -s SESSION_FILE"
30
+
31
+ parser.on('-d', '--definition-file DEFINITION_FILE', 'Specify a file container scraper definitions') do |file|
32
+ @options[:definition_file] = file
33
+ end
34
+ parser.on('-s','--session-file SESSION_FILE','Specify groups, suites, and URLs to be scraped') do |file|
35
+ @options[:session_file] = file
36
+ end
37
+ parser.on('-o','--output-file [OUTPUT_FILE]', 'Scrape result output file. Only useful for text output') do |file|
38
+ @options[:output_file] = file
39
+ end
40
+
41
+ parser.on('-g GROUP', '--group', 'Specify a single group to scrape') do |group|
42
+ @options[:only_group] = group
43
+ end
44
+
45
+ parser.on('-f', '--format [FORMAT]', 'only "json" is supported') do |format|
46
+ @options[:format] = "json"
47
+ end
48
+
49
+ parser.on_tail('-h','--help','The help file') do
50
+ puts parser
51
+ exit
52
+ end
53
+ end.parse!
54
+ end
55
+
56
+ def run(options, out = $stdout)
57
+ return 1 unless (validate_file(options[:definition_file], "definition file", out) &&
58
+ validate_file(options[:session_file], "session file", out) )
59
+
60
+ res = Scraptacular.world.run(options)
61
+
62
+ #TODO: Replace with "Formatters" e.g. JSONFormatter, XMLFormatter, whatever
63
+ if options[:format] == "json"
64
+ res.each do |group, suites|
65
+ suites.each do |suite, results|
66
+ suites[suite] = results.map!(&:to_h)
67
+ end
68
+ end
69
+
70
+ res = res.to_json
71
+ end
72
+
73
+ # TODO : Replace with "Outputters".
74
+ if options[:output_file]
75
+ File.open(options[:output_file], 'w') { |file| file.write(res) }
76
+ else
77
+ out.puts res
78
+ end
79
+
80
+ return 0
81
+ end
82
+
83
+ def validate_file(file_name, caption, out)
84
+ if file_name.nil?
85
+ out.puts "You must specify a #{caption}. See --help for more information"
86
+ return false
87
+ else
88
+ if File.exists?(file_name)
89
+ load file_name
90
+ return true
91
+ else
92
+ out.puts "The #{caption} specified does not exist: #{file_name}"
93
+ return false
94
+ end
95
+
96
+ end
97
+ end
98
+ end
99
+ end
100
+
@@ -0,0 +1,20 @@
1
+ module Scraptacular
2
+ module Core
3
+ module DSL
4
+
5
+ # Define a new grouping
6
+ def scrape_group(name, &group_block)
7
+ Scraptacular.world.register_group name, &group_block
8
+ end
9
+
10
+ # Define a new scraper
11
+ def scraper(name, &scraper_block)
12
+ Scraptacular.world.register_scraper name, &scraper_block
13
+ end
14
+
15
+ end
16
+ end
17
+ end
18
+
19
+ extend Scraptacular::Core::DSL
20
+ Module.send :include, Scraptacular::Core::DSL
@@ -0,0 +1,30 @@
1
+ module Scraptacular
2
+ class Group
3
+ attr_accessor :name, :suites
4
+
5
+ def initialize(name, &block)
6
+ @name = name
7
+ @suites = []
8
+
9
+ instance_eval(&block)
10
+ end
11
+
12
+ def run(out)
13
+ out.puts "Group: #{self.name}"
14
+
15
+ results = {}
16
+
17
+ suites.each do |suite|
18
+ results[suite.name] = suite.run(out)
19
+ end
20
+
21
+ results
22
+ end
23
+
24
+ def suite(name, options = {}, &block)
25
+ suite = Scraptacular::Suite.new(name, options, &block)
26
+ @suites << suite
27
+ suite
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,31 @@
1
+ module Scraptacular
2
+ class Result
3
+ attr_reader :page
4
+
5
+ def initialize(page)
6
+ @page = page
7
+ @result = {}
8
+ end
9
+
10
+ def merge(other_result, priority = :other)
11
+ if priority == :other
12
+ @result.merge! other_result.to_h
13
+ else
14
+ @result = other_result.to_h.merge @result
15
+ end
16
+ end
17
+
18
+ def method_missing(method_name, *args, &block)
19
+ # field_name { page.search(lskdjfskdf) }
20
+ @result[method_name] = instance_eval(&block)
21
+ end
22
+
23
+ def to_h
24
+ @result
25
+ end
26
+
27
+ def to_json
28
+ @result.to_json
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,48 @@
1
+ module Scraptacular
2
+
3
+
4
+ class Scraper
5
+ attr_reader :name, :page
6
+
7
+ def initialize(name, &block)
8
+ @name = name
9
+ @block = block
10
+ end
11
+
12
+ def result(&block)
13
+ retval = Scraptacular::Result.new(@page)
14
+ retval.instance_eval(&block)
15
+
16
+ retval.send :remove_instance_variable, :@page
17
+ @results << retval
18
+ end
19
+
20
+ def run(page)
21
+ @page = page
22
+ @results = []
23
+ instance_eval &@block
24
+
25
+
26
+ @results
27
+ end
28
+
29
+ def scrape_links(selector, options = {})
30
+ if options[:with]
31
+ unless scraper = Scraptacular.world.scrapers[options[:with]]
32
+ raise ArgumentError, "scraper #{options[:with]} does not exist"
33
+ end
34
+ else
35
+ raise ArgumentError, "You must supply a scraper using the :with option"
36
+ end
37
+
38
+ retval = []
39
+
40
+ page.search(selector).each do |link|
41
+ subpage = Scraptacular.agent.get(link.attributes["href"].value)
42
+ retval += [*scraper.run(subpage)]
43
+ end
44
+
45
+ retval
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,44 @@
1
+ module Scraptacular
2
+ class Suite
3
+ attr_accessor :default_scraper, :urls, :name
4
+
5
+ def initialize(name, options = {}, &block)
6
+ @name = name
7
+
8
+ if !options.has_key?(:with)
9
+ raise ArgumentError, "You must supply a default scraper using :with"
10
+ end
11
+
12
+ @default_scraper = Scraptacular.world.scrapers[options[:with]]
13
+ if @default_scraper.nil?
14
+ raise ArgumentError, "The supplied scraper :#{options[:with]} does not exist"
15
+ end
16
+
17
+ @urls = []
18
+
19
+ instance_eval(&block)
20
+ end
21
+
22
+ def run(out)
23
+ out.puts " Suite: #{self.name}"
24
+
25
+ results = []
26
+
27
+ urls.each do |url|
28
+ out.puts " URL: #{url.path}"
29
+
30
+ scraper = url.scraper
31
+ scraper ||= default_scraper
32
+
33
+ page = Scraptacular.agent.get(url.path)
34
+ results += [*scraper.run(page)]
35
+ end
36
+
37
+ results
38
+ end
39
+
40
+ def url(url_path, options = {})
41
+ urls << Scraptacular::URL.new(url_path, options)
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,17 @@
1
+ module Scraptacular
2
+ class URL
3
+ attr_reader :path, :scraper
4
+
5
+ def initialize(path, options = {})
6
+ @path = path
7
+
8
+ if options[:with]
9
+ unless @scraper = Scraptacular.world.scrapers[options[:with]]
10
+ raise ArgumentError, "The supplied scraper :#{options[:with]} does not exist"
11
+ end
12
+
13
+ end
14
+
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,3 @@
1
+ module Scraptacular
2
+ VERSION = "0.0.2"
3
+ end
@@ -0,0 +1,42 @@
1
+ module Scraptacular
2
+ class World
3
+ attr_accessor :groups, :scrapers
4
+
5
+ def initialize
6
+ @groups = []
7
+ @scrapers = {}
8
+ end
9
+
10
+ def register_group(name, &block)
11
+ group = Scraptacular::Group.new(name, &block)
12
+ @groups << group
13
+ group
14
+ end
15
+
16
+ def register_scraper(identifier, &block)
17
+ scraper = Scraptacular::Scraper.new(identifier, &block)
18
+ @scrapers[identifier] = scraper
19
+ scraper
20
+ end
21
+
22
+ def reset
23
+ @results = {}
24
+ end
25
+
26
+ def run(options, out = $stdout)
27
+ reset
28
+
29
+ if options[:group]
30
+ groups_to_run = @groups.select { |g| g.name == options[:group] }
31
+ else
32
+ groups_to_run = @groups
33
+ end
34
+
35
+ groups_to_run.each do |group|
36
+ @results[group.name] = group.run(out)
37
+ end
38
+
39
+ @results
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'scraptacular/version'
5
+
6
+ Gem::Specification.new do |gem|
7
+ gem.name = "scraptacular"
8
+ gem.version = Scraptacular::VERSION
9
+ gem.authors = ["Roger Vandervort"]
10
+ gem.email = ["rvandervort@gmail.com"]
11
+ gem.description = %q{Organized web scraping}
12
+ gem.summary = %q{Organized web scraping}
13
+ gem.homepage = "https://github.com/rvandervort/scraptacular"
14
+
15
+ gem.files = `git ls-files`.split($/)
16
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
+ gem.require_paths = ["lib"]
19
+
20
+ gem.add_runtime_dependency 'mechanize', '~> 2.5'
21
+ gem.add_development_dependency 'simplecov', '~> 0.7.1'
22
+ end
@@ -0,0 +1,40 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe Scraptacular::CommandLine do
4
+ describe ".run" do
5
+ let(:options) { {definition_file: 'def_file', session_file: 'sess_file' } }
6
+ let(:cl) { described_class.new }
7
+
8
+ before :each do
9
+ described_class.any_instance.stub(:parse_options)
10
+ cl.instance_variable_set :@options, options
11
+ end
12
+
13
+ context "with valid inputs" do
14
+ before :each do
15
+ cl.stub(:validate_file).with('def_file', 'definition file', $stdout).and_return(true)
16
+ cl.stub(:validate_file).with('sess_file', 'session file', $stdout).and_return(true)
17
+ end
18
+
19
+ it "tells the Scraptacular.world to run" do
20
+ Scraptacular.world.should_receive(:run).with(options)
21
+ cl.run(options)
22
+ end
23
+ end
24
+
25
+ context "with invalid inputs" do
26
+ before :each do
27
+ cl.stub(:validate_file).and_return(false)
28
+ end
29
+
30
+ it "tells the Scraptacular.world to run" do
31
+ Scraptacular.world.should_not_receive(:run)
32
+ cl.run(options)
33
+ end
34
+
35
+ end
36
+
37
+
38
+ end
39
+ end
40
+
@@ -0,0 +1,41 @@
1
+ require 'spec_helper.rb'
2
+
3
+ main = self
4
+
5
+ methods = [
6
+ :scrape_group,
7
+ :scraper
8
+ ]
9
+
10
+ methods.each do |method_name|
11
+ describe "##{method_name}" do
12
+ it "is not added to every object" do
13
+ expect(main).to respond_to(method_name)
14
+ expect(Module.new).to respond_to(method_name)
15
+ expect(Object.new).not_to respond_to(method_name)
16
+ end
17
+ end
18
+ end
19
+
20
+ describe "#scrape_group" do
21
+ let(:world) { Scraptacular::World.new}
22
+ let(:name) { "Test Group" }
23
+ let(:test_block) { Proc.new { } }
24
+
25
+ before :each do
26
+ Scraptacular.stub(:world).and_return(world)
27
+ end
28
+
29
+ it "registers a new ScrapeGroup object" do
30
+ world.should_receive(:register_group).with(name, &test_block)
31
+ Module.new.scrape_group(name, &test_block)
32
+ end
33
+
34
+ it "returns the new group" do
35
+ Module.new.scrape_group(name, &test_block).should be_instance_of(Scraptacular::Group)
36
+ end
37
+
38
+ end
39
+
40
+ describe "#scraper" do
41
+ end
@@ -0,0 +1,62 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe Scraptacular::Group do
4
+ let(:name) { "Test Group" }
5
+ let(:block) { Proc.new { @test_instance_var = 42 } }
6
+ let(:group) { described_class.new(name, &block) }
7
+
8
+ before :all do
9
+ Scraptacular.world.register_scraper :test_scraper, &Proc.new {}
10
+ end
11
+
12
+ describe "initialize" do
13
+ it "sets the group name" do
14
+ group.name.should == name
15
+ end
16
+
17
+ it "creates an array of suites" do
18
+ group.suites.should be_instance_of(Array)
19
+ end
20
+
21
+ it "executes the block to register the suites" do
22
+ group.instance_variable_get(:@test_instance_var).should == 42
23
+ end
24
+ end
25
+
26
+ describe ".run" do
27
+ let(:suite) { Scraptacular::Suite.new "Test Suite", {with: :test_scraper}, &Proc.new {}}
28
+
29
+ before :each do
30
+ suite.stub(:run).and_return([1,2,3])
31
+ group.suites << suite
32
+ end
33
+
34
+ it "runs each suite" do
35
+ suite.should_receive(:run)
36
+ group.run($stdout)
37
+ end
38
+
39
+ it "returns a hash of results for the suites" do
40
+ group.run($stdout).should == {"Test Suite" => [1,2,3]}
41
+ end
42
+ end
43
+
44
+ describe ".suite" do
45
+ let(:suite_name) { "Test Suite" }
46
+ let(:suite_options) {{with: :default_scraper } }
47
+ let(:suite_block) { Proc.new {} }
48
+
49
+ before :each do
50
+ Scraptacular.world.scrapers.stub(:[]).and_return(true)
51
+ end
52
+
53
+ it "creates a new Scraptacular::Suite" do
54
+ group.suites.should_receive(:<<).with(an_instance_of(Scraptacular::Suite))
55
+ group.suite(suite_name, suite_options, &suite_block)
56
+ end
57
+
58
+ it "returns the new Suite object" do
59
+ group.suite(suite_name, suite_options, &suite_block).should be_instance_of(Scraptacular::Suite)
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,48 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe Scraptacular::Result do
4
+ let(:page) { stub('Page') }
5
+ let(:result) { described_class.new(page) }
6
+
7
+ describe "initialize" do
8
+ it "sets the internal page reference" do
9
+ result.instance_variable_get(:@page).should == page
10
+ end
11
+
12
+ it "sets the result to an empty hash" do
13
+ result.instance_variable_get(:@result).should == {}
14
+ end
15
+ end
16
+
17
+ describe ".merge(other_result, priority = :other)" do
18
+ let(:other_result) { described_class.new(page) }
19
+
20
+ before :each do
21
+ result.instance_variable_set :@result, {url: "url1", attribute: "test"}
22
+ other_result.instance_variable_set :@result, {url: "url2", attribute2: "test"}
23
+ end
24
+
25
+ it "adds values from the other_result to the current result hash" do
26
+ result.merge other_result, :other
27
+ result.to_h.should == {url: "url2", attribute: "test", attribute2: "test"}
28
+ end
29
+
30
+ context "priority is self" do
31
+ it "does not overwrite values in the current result wiht those from the other" do
32
+ result.merge other_result, :self
33
+ result.to_h.should == {url: "url1", attribute: "test", attribute2: "test"}
34
+ end
35
+ end
36
+ end
37
+
38
+ describe ".method_missing" do
39
+ it "evaluates the method name as a new key in the result hash" do
40
+ result.new_field { "test value" }
41
+ result.to_h.has_key?(:new_field).should be_true
42
+ result.to_h[:new_field].should == "test value"
43
+ end
44
+ end
45
+
46
+ describe ".to_h" do
47
+ end
48
+ end
@@ -0,0 +1,84 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe Scraptacular::Scraper do
4
+ let(:name) { :scraper_name }
5
+ let(:block) { Proc.new { @test_val = "Hello" } }
6
+ let(:scraper) { described_class.new(name, &block) }
7
+
8
+
9
+ describe "initialize" do
10
+ it "sets the internal identifier" do
11
+ scraper.name.should == name
12
+ end
13
+ it "sets the internal block" do
14
+ scraper.instance_variable_get(:@block).should == block
15
+ end
16
+ end
17
+
18
+ describe ".scrape_links(selector, options = {})" do
19
+
20
+ let(:sub_result1) { stub('Node', attributes: {"href" => stub('Attribute', value: "url1") }) }
21
+ let(:sub_result2) { stub('Node', attributes: {"href" => stub('Attribute', value: "url2") }) }
22
+
23
+ let(:page) { stub('Mechanize::Page') }
24
+ let(:sub_page1) { stub('Mechanize::Page') }
25
+ let(:sub_page2) { stub('Mechanize::Page') }
26
+
27
+ let(:sub_scraper) do
28
+ Scraptacular::define_scraper :exists do
29
+ result do
30
+ end
31
+ end
32
+ end
33
+ before :each do
34
+ sub_scraper
35
+ scraper.instance_variable_set :@page, page
36
+ page.stub(:search).and_return([sub_result1, sub_result2])
37
+ Mechanize.any_instance.stub(:get).with("url1").and_return(sub_page1)
38
+ Mechanize.any_instance.stub(:get).with("url2").and_return(sub_page2)
39
+ end
40
+
41
+ it "raises an ArgumentError if no :with parameter was supplied" do
42
+ expect { scraper.scrape_links("a.test_links", {})}.to raise_error(ArgumentError)
43
+ end
44
+ it "raises an ArgumentError if the :with scraper is not registered" do
45
+ expect { scraper.scrape_links("a.test_links", {with: :does_not_exist})}.to raise_error(ArgumentError)
46
+ end
47
+ it "does not raise an ArgumentError if the :with scraper is registered" do
48
+ expect { scraper.scrape_links("a.test_links", {with: :exists})}.not_to raise_error(ArgumentError)
49
+ end
50
+ it "searches the page for links matching the selector" do
51
+ page.should_receive(:search).with("a.test_links")
52
+ scraper.scrape_links "a.test_links", {with: :exists}
53
+ end
54
+ it "retrieves the page contents for each found URL" do
55
+ Mechanize.any_instance.should_receive(:get).with("url1").exactly(1).times
56
+ Mechanize.any_instance.should_receive(:get).with("url2").exactly(1).times
57
+
58
+ scraper.scrape_links "a.test_links", {with: :exists}
59
+ end
60
+
61
+ it "scrapes the sub-pages using the supplied :with scraper" do
62
+ sub_scraper.should_receive(:run).with(sub_page1).exactly(1).times
63
+ sub_scraper.should_receive(:run).with(sub_page2).exactly(1).times
64
+ scraper.scrape_links "a.test_links", {with: :exists}
65
+ end
66
+
67
+ it "returns an array of results" do
68
+ scraper.scrape_links("a.test_links", {with: :exists}).should be_instance_of(Array)
69
+ end
70
+ end
71
+
72
+ describe ".run" do
73
+ let(:page) { stub 'Mechanize::Page' }
74
+ it "evalutes the block" do
75
+ expect { scraper.run page }.to change { scraper.instance_variable_get(:@test_val) }.to("Hello")
76
+ end
77
+ it "returns an array of results" do
78
+ scraper.run(page).should be_instance_of(Array)
79
+ end
80
+ end
81
+
82
+ describe ".result" do
83
+ end
84
+ end
@@ -0,0 +1,94 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe Scraptacular::Suite do
4
+ let(:name) { "Test Suite" }
5
+ let(:block) { Proc.new { @test_var = "HI!" } }
6
+ let(:scraper) { stub("Scraptacular::Scraper") }
7
+ let(:suite_options) {{with: :some_identifier }}
8
+
9
+ let(:suite) { Scraptacular::Suite.new(name, suite_options, &block) }
10
+
11
+ before :each do
12
+ Scraptacular.world.scrapers.stub(:[]).with(:some_identifier).and_return(scraper)
13
+ scraper.stub(:run)
14
+ end
15
+
16
+ describe "#initialize" do
17
+ context ":with argument" do
18
+ it "sets the default scraper, if supplied" do
19
+ suite.default_scraper.should == scraper
20
+ end
21
+
22
+ it "raises an ArgumentError if the with parameter is not supplied" do
23
+ suite_options.delete(:with)
24
+ expect { suite }.to raise_error(ArgumentError)
25
+ end
26
+
27
+ it "raises an ArgumentError if the selected scraper is not defined" do
28
+ Scraptacular.world.scrapers.stub(:[]).and_return(nil)
29
+ expect { suite }.to raise_error(ArgumentError)
30
+ end
31
+ end
32
+
33
+ it "initializes an @urls attribute as an Array" do
34
+ suite.instance_variable_get(:@urls).should be_instance_of(Array)
35
+ end
36
+
37
+ it "executes the passed block" do
38
+ suite.instance_variable_get(:@test_var).should == "HI!"
39
+ end
40
+ end
41
+
42
+ describe ".run" do
43
+ let(:url1) { Scraptacular::URL.new("test_url_1") }
44
+ let(:url2) { Scraptacular::URL.new("test_url_2") }
45
+ let(:page1) { stub('Page 1') }
46
+ let(:page2) { stub('Page 2') }
47
+
48
+ let(:fake_result) { stub('Scraptacular::Result')}
49
+
50
+
51
+ before :each do
52
+ suite.urls << url1
53
+ suite.urls << url2
54
+
55
+ Mechanize.any_instance.stub(:get).with(url1.path).and_return(page1)
56
+ Mechanize.any_instance.stub(:get).with(url2.path).and_return(page2)
57
+
58
+ scraper.stub(:run).and_return(fake_result)
59
+ end
60
+
61
+ it "iterates through all of the URLs" do
62
+ Mechanize.any_instance.should_receive(:get).with(url1.path)
63
+ Mechanize.any_instance.should_receive(:get).with(url2.path)
64
+
65
+ suite.run($stdout)
66
+ end
67
+
68
+ it "uses the URL's scraper, if defined" do
69
+ sc = Scraptacular.world.register_scraper(:url2_scraper, &Proc.new {})
70
+ url2.instance_variable_set :@scraper, sc
71
+
72
+ sc.should_receive(:run).with(page2)
73
+ suite.run($stdout)
74
+ end
75
+
76
+ it "uses the default scraper if a url-specified scraper is not defined" do
77
+ scraper.should_receive(:run).with(page1).exactly(1).times
78
+ scraper.should_receive(:run).with(page2).exactly(1).times
79
+
80
+ suite.run($stdout)
81
+ end
82
+
83
+ it "returns an Array of results" do
84
+ suite.run($stdout).should == [fake_result, fake_result]
85
+ end
86
+ end
87
+
88
+ describe ".url(path, options = {})" do
89
+ it "appends a new Scraptacular::URL object to the @urls list" do
90
+ suite.urls.should_receive(:<<).with(an_instance_of(Scraptacular::URL))
91
+ suite.url("dummy path", {})
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,22 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe Scraptacular::URL do
4
+ let(:path) { "http://www.google.com" }
5
+
6
+ describe "#initialize" do
7
+ it "raises an argument error if the :with option does not reference a valid scraper" do
8
+ expect { Scraptacular::URL.new(path,{with: :does_not_exist}) }.to raise_error(ArgumentError)
9
+ end
10
+ it "does not raise an argument error if the :with options references a valid scraper" do
11
+ Scraptacular.world.scrapers[:exists] = "nothing interesting"
12
+
13
+ expect { Scraptacular::URL.new(path,{with: :exists}) }.not_to raise_error(ArgumentError)
14
+ end
15
+
16
+ it "sets the internal @path attribute" do
17
+ described_class.new(path).path.should == path
18
+ end
19
+ end
20
+
21
+
22
+ end
@@ -0,0 +1,58 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe Scraptacular::World do
4
+ let(:world) { described_class.new }
5
+ let(:name) { "Name "}
6
+ let(:block) { Proc.new {} }
7
+
8
+ describe ".register_group" do
9
+ it "Creates a new group object" do
10
+ Scraptacular::Group.should_receive(:new)
11
+ world.register_group(name, &block)
12
+ end
13
+
14
+ it "adds the group object to the world's known group list" do
15
+ expect { world.register_group(name, &block) }.to change { world.groups.count }.to(1)
16
+ end
17
+ end
18
+
19
+ describe ".register_scraper" do
20
+ it "creates a new scraper object" do
21
+ Scraptacular::Scraper.should_receive(:new)
22
+ world.register_scraper(name, &block)
23
+ end
24
+
25
+ it "adds the scraper object the world's known scraper list" do
26
+ expect { world.register_scraper(name, &block) }.to change{world.scrapers.count }.to(1)
27
+ end
28
+ end
29
+
30
+ describe ".run(options)" do
31
+ let(:group1) { Scraptacular::Group.new "Group 1", &Proc.new {}}
32
+ let(:group2) { Scraptacular::Group.new "Group 2", &Proc.new {}}
33
+
34
+ before :all do
35
+ world.groups << group1
36
+ world.groups << group2
37
+ end
38
+
39
+ it "runs all groups if :group is not specified" do
40
+ group1.should_receive(:run)
41
+ group2.should_receive(:run)
42
+
43
+ world.run({})
44
+ end
45
+
46
+ it "runs only a single group if specified, and exists" do
47
+ group1.should_receive(:run)
48
+ group2.should_not_receive(:run)
49
+
50
+ world.run({group: "Group 1"})
51
+ end
52
+
53
+ it "returns a hash of results" do
54
+ p world.run({})
55
+
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,25 @@
1
+ require 'simplecov'
2
+ SimpleCov.start
3
+
4
+ require 'scraptacular'
5
+
6
+ RSpec.configure do |config|
7
+
8
+ # Redirect the programs output elsewhere
9
+ # See : https://gist.github.com/adamstegman/926858
10
+ config.before :all do
11
+ @orig_stderr = $stderr
12
+ @orig_stdout = $stdout
13
+
14
+ # redirect stderr and stdout to /dev/null
15
+ $stderr = File.new('/dev/null', 'w')
16
+ $stdout = File.new('/dev/null', 'w')
17
+ end
18
+
19
+ config.after :all do
20
+ $stderr = @orig_stderr
21
+ $stdout = @orig_stdout
22
+ @orig_stderr = nil
23
+ @orig_stdout = nil
24
+ end
25
+ end
metadata ADDED
@@ -0,0 +1,108 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scraptacular
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Roger Vandervort
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-03-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mechanize
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '2.5'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '2.5'
27
+ - !ruby/object:Gem::Dependency
28
+ name: simplecov
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 0.7.1
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: 0.7.1
41
+ description: Organized web scraping
42
+ email:
43
+ - rvandervort@gmail.com
44
+ executables:
45
+ - scraptacular
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - .gitignore
50
+ - .rspec
51
+ - Gemfile
52
+ - LICENSE.txt
53
+ - README.md
54
+ - Rakefile
55
+ - bin/scraptacular
56
+ - lib/scraptacular.rb
57
+ - lib/scraptacular/command_line.rb
58
+ - lib/scraptacular/dsl.rb
59
+ - lib/scraptacular/group.rb
60
+ - lib/scraptacular/result.rb
61
+ - lib/scraptacular/scraper.rb
62
+ - lib/scraptacular/suite.rb
63
+ - lib/scraptacular/url.rb
64
+ - lib/scraptacular/version.rb
65
+ - lib/scraptacular/world.rb
66
+ - scraptacular.gemspec
67
+ - spec/scraptacular/command_line_spec.rb
68
+ - spec/scraptacular/dsl_spec.rb
69
+ - spec/scraptacular/group_spec.rb
70
+ - spec/scraptacular/result_spec.rb
71
+ - spec/scraptacular/scraper_spec.rb
72
+ - spec/scraptacular/suite_spec.rb
73
+ - spec/scraptacular/url_spec.rb
74
+ - spec/scraptacular/world_spec.rb
75
+ - spec/spec_helper.rb
76
+ homepage: https://github.com/rvandervort/scraptacular
77
+ licenses: []
78
+ metadata: {}
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ! '>='
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements: []
94
+ rubyforge_project:
95
+ rubygems_version: 2.0.3
96
+ signing_key:
97
+ specification_version: 4
98
+ summary: Organized web scraping
99
+ test_files:
100
+ - spec/scraptacular/command_line_spec.rb
101
+ - spec/scraptacular/dsl_spec.rb
102
+ - spec/scraptacular/group_spec.rb
103
+ - spec/scraptacular/result_spec.rb
104
+ - spec/scraptacular/scraper_spec.rb
105
+ - spec/scraptacular/suite_spec.rb
106
+ - spec/scraptacular/url_spec.rb
107
+ - spec/scraptacular/world_spec.rb
108
+ - spec/spec_helper.rb