hash_spidey 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,19 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ .DS_Store
+
data/Gemfile ADDED
@@ -0,0 +1,10 @@
+ source 'https://rubygems.org'
+
+ group :test do
+   gem 'rspec'
+   gem 'fakeweb', '~> 1.3'
+ end
+
+
+ # Specify your gem's dependencies in hash_spidey.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2013 dannguyen
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
+ # HashSpidey
+
+ A rough implementation of [joeyAghion's Spidey](https://github.com/joeyAghion/spidey) abstract web-crawling framework that uses an in-memory Hash to store crawled pages and queued links. The API is unstable and likely to change.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'hash_spidey'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install hash_spidey
+
+ ## Usage
+
+ TODO: Write usage instructions here
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
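
The Usage section above is still a TODO. Pending real docs, here is a minimal sketch of the intended flow, reconstructed from the gem's spec suite (`ExampleSpider` and `process_page` are illustrative names, not part of the gem):

```ruby
require 'hash_spidey'

class ExampleSpider < HashSpidey::AbstractSpider
  # Handlers receive the Mechanize page plus any default data
  def process_page(page, default_data = {})
    record_page(page) # store the page body in the in-memory Hash
  end
end

spider = ExampleSpider.new request_interval: 0
spider.handle "http://www.example.com/", :process_page
spider.crawl max_urls: 10

spider.crawls  # => Hash of URL => HashUrlRecord for pages that were fetched
spider.records # => Hash of URL => HashUrlRecord whose content was recorded
```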
data/Rakefile ADDED
@@ -0,0 +1,20 @@
+ require 'rdoc/task'
+ require 'rubygems'
+ require 'rubygems/package_task'
+ require 'rspec/core/rake_task'
+
+ desc 'Default: run specs.'
+ task :default => :rspec
+
+ desc 'Run the specs'
+ RSpec::Core::RakeTask.new(:rspec) do |t|
+   t.rspec_opts = ['--color']
+   t.pattern = './spec/**/*_spec.rb'
+ end
+
+ spec = Gem::Specification.load("#{File.dirname(__FILE__)}/hash_spidey.gemspec")
+
+ desc "Package gem."
+ Gem::PackageTask.new(spec) do |pkg|
+   pkg.gem_spec = spec
+ end
data/hash_spidey.gemspec ADDED
@@ -0,0 +1,29 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'hash_spidey/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "hash_spidey"
+   spec.version       = HashSpidey::VERSION
+   spec.authors       = ["dannguyen"]
+   spec.email         = ["dansonguyen@gmail.com"]
+   spec.description   = %q{An implementation of joeyAghion's Spidey class at Artsy}
+   spec.summary       = %q{Uses a Hash object to store crawling process, which it can then dump to an external store}
+   spec.homepage      = "http://github.com/dannguyen"
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "bundler", "~> 1.3"
+   spec.add_development_dependency "rake"
+
+
+   spec.add_dependency 'spidey', '~> 0.1'
+   spec.add_dependency 'hashie'
+   spec.add_dependency 'addressable'
+
+ end
data/lib/hash_spidey.rb ADDED
@@ -0,0 +1,12 @@
+ require "hash_spidey/version"
+
+ require 'hashie'
+ require 'spidey'
+ require_relative 'hash_spidey/hash_url_record'
+ require_relative 'hash_spidey/strategies/hash_store_strategy'
+
+ module HashSpidey
+   class AbstractSpider < Spidey::AbstractSpider
+     include HashSpidey::Strategies::HashStore
+   end
+ end
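
`AbstractSpider` above is simply Spidey's base spider with the Hash-backed storage strategy mixed in. The same mixin can be applied to any `Spidey::AbstractSpider` subclass directly; a minimal sketch (`MySpider` is a hypothetical name):

```ruby
require 'hash_spidey'

# Equivalent to subclassing HashSpidey::AbstractSpider: take Spidey's
# crawling machinery and add the in-memory Hash storage strategy.
class MySpider < Spidey::AbstractSpider
  include HashSpidey::Strategies::HashStore
end
```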
data/lib/hash_spidey/crawl_record.rb ADDED
@@ -0,0 +1,39 @@
+ require 'hashie'
+ require 'mechanize'
+
+ module HashSpidey
+
+   class CrawlRecord < BasicObject
+
+     META_ATTS = %w(crawled_timestamp title header code response_header_charset meta_charset detected_encoding content_type)
+     attr_reader :crawled_timestamp
+
+     def initialize(obj, timestamp)
+       @crawled_timestamp = timestamp
+       @page_object = obj
+     end
+
+     # Collects the crawl metadata into a Mash. ::Hashie must be fully qualified
+     # because a BasicObject subclass cannot resolve top-level constants unqualified.
+     def to_hash
+       msh = ::Hashie::Mash.new
+       msh['crawled_timestamp'] = @crawled_timestamp
+       META_ATTS.each do |att|
+         msh[att] = @page_object.send(att) if @page_object.respond_to?(att)
+       end
+       msh
+     end
+
+     protected
+
+     # Delegate unknown messages (#code, #header, etc.) to the wrapped page object
+     def method_missing(name, *args, &block)
+       if @page_object.respond_to?(name)
+         @page_object.send(name, *args, &block)
+       else
+         super
+       end
+     end
+
+   end
+ end
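
Because `CrawlRecord` inherits from `BasicObject`, almost any message sent to it falls through `method_missing` to the wrapped Mechanize page, while `#crawled_timestamp` is answered by the record itself. A brief sketch of that behavior, using an `OpenStruct` as a hypothetical stand-in for a page:

```ruby
require 'ostruct'
require 'hash_spidey'

# Hypothetical stand-in for a Mechanize page, for illustration only
fake_page = OpenStruct.new(code: '200', title: 'Example', content_type: 'text/html')

record = HashSpidey::CrawlRecord.new(fake_page, Time.now)

record.code              # => '200' (forwarded to the page via method_missing)
record.crawled_timestamp # => the Time passed to the constructor
record.to_hash           # => Mash of crawled_timestamp, title, code, content_type
```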
data/lib/hash_spidey/hash_url_record.rb ADDED
@@ -0,0 +1,73 @@
+ require 'addressable/uri'
+ require_relative 'crawl_record'
+
+ module HashSpidey
+   class HashUrlRecord
+
+     attr_reader :url,
+       :initialized_timestamp, :crawled_timestamp, :recorded_timestamp,
+       :content, :handler, :spider, :handle_data,
+       :crawl_metadata
+
+
+     # convenience constructor for spidey
+     def self.spidey_handle(url, handler, spider, opts)
+       mash_opts = Hashie::Mash.new opts
+       mash_opts.spider = spider
+       mash_opts.handler = handler
+
+       HashUrlRecord.new url, mash_opts
+     end
+
+     def initialize(url, opts={})
+       @url = url
+       @addressable_uri = Addressable::URI.parse(@url)
+       @initialized_timestamp = Time.now
+
+       mash_opts = Hashie::Mash.new(opts)
+       @spider = mash_opts.delete :spider
+       @handler = mash_opts.delete :handler
+       @handle_data = mash_opts.delete :handle_data # not sure if needed?...
+     end
+
+
+     def record_content(ct)
+       @content = ct
+       @recorded_timestamp = Time.now
+     end
+
+     # saves crawl metadata from the Mechanize page object
+     def mark_as_crawled(page_obj={})
+       @crawled_timestamp = Time.now
+       # wrap the Mechanize page object in a CrawlRecord
+       @crawl_metadata = HashSpidey::CrawlRecord.new(page_obj, @crawled_timestamp)
+     end
+
+     def recorded?
+       !@recorded_timestamp.nil?
+     end
+
+     def crawled?
+       !crawled_timestamp.nil?
+     end
+
+
+     ## aliases and delegators to @crawl_metadata
+
+     # obvious smells
+     def collected_timestamp; @recorded_timestamp; end
+     def header; @crawl_metadata.header unless @crawl_metadata.nil?; end
+     def code; @crawl_metadata.code unless @crawl_metadata.nil?; end
+
+     #### URL inspection methods, delegated to Addressable::URI
+     [:host, :port, :query, :scheme, :path].each do |foo|
+       define_method foo do
+         @addressable_uri.send foo
+       end
+     end
+
+     def query_values
+       @addressable_uri.query_values
+     end
+   end
+ end
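
A `HashUrlRecord` moves through three states: initialized, recorded (content saved), and crawled (metadata saved), and its URI readers delegate to `Addressable::URI`. A quick walkthrough of those semantics:

```ruby
require 'hash_spidey'

hurl = HashSpidey::HashUrlRecord.new 'http://www.example.com/stuff/?q=1'

# URI inspection is delegated to Addressable::URI
hurl.host         # => "www.example.com"
hurl.query_values # => {"q"=>"1"}

hurl.recorded?    # => false
hurl.record_content 'Hello World' # stores content, stamps @recorded_timestamp
hurl.recorded?    # => true

hurl.crawled?     # => false
hurl.mark_as_crawled              # wraps the (empty) page object in a CrawlRecord
hurl.crawled?     # => true
```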
data/lib/hash_spidey/strategies/hash_store_strategy.rb ADDED
@@ -0,0 +1,112 @@
+ module HashSpidey
+   module Strategies
+
+     module HashStore
+
+       def initialize(attrs = {})
+         @url_collection = {}
+         @error_collection = []
+
+         super(attrs)
+       end
+
+       #### process strategies
+
+
+       ## convenience methods
+       def crawls
+         @url_collection.select { |k, v| v.crawled? }
+       end
+
+
+       def uncrawled
+         @url_collection.reject { |k, v| v.crawled? }
+       end
+
+       def records
+         @url_collection.select { |k, v| v.recorded? }
+       end
+
+       def process_crawl(url, page)
+         h_url = @url_collection[url]
+         h_url.mark_as_crawled(page)
+       end
+
+
+       def crawl(options = {})
+         @crawl_started_at = Time.now
+         @until = Time.now + options[:crawl_for] if options[:crawl_for]
+
+         i = 0
+         each_url do |url, handler, default_data|
+           break if options[:max_urls] && i >= options[:max_urls]
+           begin
+             page = agent.get(url)
+             Spidey.logger.info "Handling #{url.inspect}"
+             process_crawl(url, page)
+             send handler, page, default_data
+           rescue => ex # NOTE: an errored URL stays uncrawled, so get_next_url_hash will retry it
+             add_error url: url, handler: handler, error: ex
+           end
+           sleep request_interval if request_interval > 0
+           i += 1
+         end
+       end
+
+
+       def handle(url, handler, handle_data = {})
+         Spidey.logger.info "Queueing #{url.inspect[0..200]}..."
+
+         spider_name = self.class.name
+         @url_collection[url] ||= HashUrlRecord.spidey_handle(url, handler, spider_name, handle_data)
+       end
+
+       # expects @url_collection to have :url, but if not, creates a new HashUrlRecord
+       def record(data_hashie)
+         url = data_hashie.url
+         h_url = @url_collection[url] || HashUrlRecord.new(url)
+
+         # set the content and recorded_timestamp of the HashUrlRecord
+         h_url.record_content(data_hashie.content)
+
+         # reassign, update collection
+         @url_collection[url] = h_url
+       end
+
+
+       # wrapper around #record
+       def record_page(page, default_data = {})
+         msh = Hashie::Mash.new(default_data)
+         msh.url = page.uri.to_s
+         msh.content = page.content
+
+         record(msh)
+       end
+
+       def each_url
+         while h_url = get_next_url_hash
+           yield h_url.url, h_url.handler, h_url.handle_data
+         end
+       end
+
+       protected
+
+       def add_error(attrs)
+         @error_collection << attrs
+         Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
+       end
+
+
+       private
+
+       def get_next_url_hash
+         return nil if @until && Time.now >= @until # exceeded time bound
+
+         # uncrawled is a filtered collection
+         uncrawled.values.first
+       end
+
+
+     end
+   end
+ end
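
The crawl loop above honors two options: `:max_urls` caps the number of fetches, and `:crawl_for` sets a wall-clock budget in seconds (`get_next_url_hash` returns nil once `@until` passes). A sketch of both knobs together (`NewsSpider` and `process_index` are illustrative names):

```ruby
require 'hash_spidey'

class NewsSpider < HashSpidey::AbstractSpider
  def process_index(page, default_data = {})
    record_page(page) # save the page body into the in-memory Hash
  end
end

spider = NewsSpider.new request_interval: 1 # seconds to sleep between requests
spider.handle "http://www.example.com/", :process_index

# Stop after 50 URLs or 5 minutes, whichever comes first
spider.crawl max_urls: 50, crawl_for: 60 * 5

spider.uncrawled # => records that were queued but never fetched
```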
data/lib/hash_spidey/version.rb ADDED
@@ -0,0 +1,3 @@
+ module HashSpidey
+   VERSION = "0.0.1"
+ end
data/spec/spec.rake ADDED
@@ -0,0 +1,17 @@
+ begin
+   require 'rspec/core/rake_task'
+
+   spec_tasks = Dir['spec/*/'].map { |d| File.basename(d) }
+
+   spec_tasks.each do |folder|
+     RSpec::Core::RakeTask.new("spec:#{folder}") do |t|
+       t.pattern = "./spec/#{folder}/**/*_spec.rb"
+       t.rspec_opts = %w(-fs --color)
+     end
+   end
+
+   desc "Run complete application spec suite"
+   task 'spec' => spec_tasks.map { |f| "spec:#{f}" }
+ rescue LoadError
+   puts "RSpec is not part of this bundle; skipping specs."
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,15 @@
+ require 'hash_spidey'
+ require 'fakeweb'
+
+ RSpec.configure do |config|
+   config.filter_run_excluding :skip => true
+   config.formatter = :documentation # :progress, :html, :textmate
+   config.fail_fast = true
+   config.before(:each) do
+   end
+
+   config.after(:each) do
+   end
+ end
+
+
data/spec/spiders/hash_store_strategy_spec.rb ADDED
@@ -0,0 +1,98 @@
+ require 'spec_helper'
+
+ describe HashSpidey::Strategies::HashStore do
+
+   before(:each) do
+
+   end
+
+
+   class TestSpider < HashSpidey::AbstractSpider
+     DEFAULT_REQUEST_INTERVAL = 0.001
+
+     include HashSpidey::Strategies::HashStore
+     def process_size(npage, data={})
+       npage.inspect
+     end
+
+   end
+
+   context 'generic #handle' do
+
+     before(:each) do
+       FakeWeb.register_uri(:get, "http://www.example.com/", :body => "Hello World", code: 200,
+         "content-type" => "text/html; charset=UTF-8"
+       )
+       @spider = TestSpider.new request_interval: 0
+       @spider.handle "http://www.example.com/", :process_size
+       @spider.crawl
+     end
+
+     describe '#crawls' do
+       it 'should only add to #crawls' do
+         expect( @spider.crawls.count ).to eq 1
+         expect( @spider.records.count ).to eq 0
+       end
+
+       it 'should update #crawled_timestamp' do
+         @crawled_url = @spider.crawls.values.first
+         expect( @crawled_url.url ).to eq 'http://www.example.com/'
+         expect( @crawled_url.crawled_timestamp > @crawled_url.initialized_timestamp ).to be_true
+       end
+
+       it 'should have #crawls act as a Hash' do
+         expect( @spider.crawls['http://www.example.com/'].url ).to eq 'http://www.example.com/'
+       end
+
+       it "should not add duplicate URLs" do
+         @spider.handle "http://www.example.com/", :process_something_else # second time
+         expect( @spider.crawls.count ).to eq 1
+       end
+
+       context '@crawl_record' do
+
+         before(:each) do
+           @crawled_url = @spider.crawls["http://www.example.com/"]
+         end
+
+         it 'should respond to #code' do
+           expect(@crawled_url.code).to eq '200'
+         end
+
+         it 'should respond to header#content-type' do
+           expect(@crawled_url.header['content-type']).to eq "text/html; charset=UTF-8"
+         end
+       end
+     end
+
+
+
+   end
+
+
+   context 'generic #record' do
+     describe '#records' do
+       before(:each) do
+
+         @data = Hashie::Mash.new url: 'http://www.example.com/', content: 'Hello World'
+         @spider = TestSpider.new request_interval: 0
+         @spider.record @data
+       end
+
+       it "should add to records" do
+         expect(@spider.records.count).to eq 1
+         expect(@spider.records['http://www.example.com/'].content).to eq 'Hello World'
+       end
+
+       it 'should update existing result' do
+         @spider.record Hashie::Mash.new url: 'http://www.example.com/', content: 'Bye World'
+         expect(@spider.records['http://www.example.com/'].content).to eq 'Bye World'
+         expect(@spider.records.count).to eq 1
+       end
+     end
+   end
+
+
+
+
+ end
data/spec/unit/hash_url_record_spec.rb ADDED
@@ -0,0 +1,73 @@
+ require 'spec_helper'
+
+ include HashSpidey
+ describe HashSpidey::HashUrlRecord do
+
+
+   context "delegate URI methods to Addressable::URI" do
+
+     before(:each) do
+       @hurl = HashUrlRecord.new 'http://www.example.com:80/stuff/?q=1&a=2&b=hello'
+     end
+
+
+     it 'should have #host' do
+       expect( @hurl.host ).to eq 'www.example.com'
+     end
+
+     it 'should have #port' do
+       expect( @hurl.port ).to eq 80
+     end
+
+     it 'should have #query' do
+       expect( @hurl.query ).to eq 'q=1&a=2&b=hello'
+     end
+
+     it 'should have #scheme' do
+       expect( @hurl.scheme ).to eq 'http'
+     end
+
+     it 'should have #path' do
+       expect( @hurl.path ).to eq '/stuff/'
+     end
+   end
+
+   context "state changes upon record and crawl" do
+     before(:each) do
+       @hurl = HashUrlRecord.new "http://www.example.com"
+     end
+
+     describe '#record_content' do
+       before(:each) do
+         @hurl.record_content 'hello'
+       end
+
+       it 'should set @recorded_timestamp' do
+         expect( @hurl.recorded_timestamp ).to be_within(2).of Time.now
+       end
+
+       it 'should set @content' do
+         expect( @hurl.content ).to eq 'hello'
+       end
+
+       it 'should have #recorded? be true' do
+         expect( @hurl.recorded? ).to be_true
+       end
+     end
+
+     describe '#mark_as_crawled' do
+       before(:each) do
+         @hurl.mark_as_crawled
+       end
+
+       it 'should set @crawled_timestamp' do
+         expect( @hurl.crawled_timestamp ).to be_within(2).of Time.now
+       end
+
+       it 'should have #crawled? be true' do
+         expect( @hurl.crawled? ).to be_true
+       end
+     end
+   end
+
+ end
metadata ADDED
@@ -0,0 +1,146 @@
+ --- !ruby/object:Gem::Specification
+ name: hash_spidey
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+   prerelease:
+ platform: ruby
+ authors:
+ - dannguyen
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-06-16 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.3'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: spidey
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '0.1'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '0.1'
+ - !ruby/object:Gem::Dependency
+   name: hashie
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: addressable
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: An implementation of joeyAghion's Spidey class at Artsy
+ email:
+ - dansonguyen@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - hash_spidey.gemspec
+ - lib/hash_spidey.rb
+ - lib/hash_spidey/crawl_record.rb
+ - lib/hash_spidey/hash_url_record.rb
+ - lib/hash_spidey/strategies/hash_store_strategy.rb
+ - lib/hash_spidey/version.rb
+ - spec/spec.rake
+ - spec/spec_helper.rb
+ - spec/spiders/hash_store_strategy_spec.rb
+ - spec/unit/hash_url_record_spec.rb
+ homepage: http://github.com/dannguyen
+ licenses:
+ - MIT
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.23
+ signing_key:
+ specification_version: 3
+ summary: Uses a Hash object to store crawling process, which it can then dump to an
+   external store
+ test_files:
+ - spec/spec.rake
+ - spec/spec_helper.rb
+ - spec/spiders/hash_store_strategy_spec.rb
+ - spec/unit/hash_url_record_spec.rb