hash_spidey 0.0.1 → 0.0.2

@@ -1,112 +1,108 @@
  module HashSpidey
  module Strategies
-
  module HashStore

- def initialize(attrs = {})
- @url_collection = {}
- @error_collection = []
-
- super(attrs)
- end
+ def initialize(attrs = {})
+ @url_collection = {}
+ @error_collection = []

- #### process strategies
+ super(attrs)
+ end

+ #### process strategies

- ## conveinence methods
- def crawls
- @url_collection.select{|k,v| v.crawled?}
- end

+ ## conveinence methods
+ def crawls
+ @url_collection.select{|k,v| v.crawled?}
+ end

- def uncrawled
- @url_collection.reject{|k,v| v.crawled?}
- end

- def records
- @url_collection.select{|k,v| v.recorded?}
- end
+ def uncrawled
+ @url_collection.reject{|k,v| v.crawled?}
+ end

- def process_crawl(url, page)
- h_url = @url_collection[url]
- h_url.mark_as_crawled(page)
- end
+ def records
+ @url_collection.select{|k,v| v.recorded?}
+ end

+ def process_crawl(url, page)
+ h_url = @url_collection[url]
+ h_url.mark_as_crawled(page)
+ end

- def crawl(options = {})
- @crawl_started_at = Time.now
- @until = Time.now + options[:crawl_for] if options[:crawl_for]

- i = 0
- each_url do |url, handler, default_data|
- break if options[:max_urls] && i >= options[:max_urls]
- begin
- page = agent.get(url)
- Spidey.logger.info "Handling #{url.inspect}"
- process_crawl(url, page)
- send handler, page, default_data
- rescue => ex
- add_error url: url, handler: handler, error: ex
- end
- sleep request_interval if request_interval > 0
- i += 1
- end
- end
+ def crawl(options = {})
+ @crawl_started_at = Time.now
+ @until = Time.now + options[:crawl_for] if options[:crawl_for]
+
+ i = 0
+ each_url do |url, handler, default_data|
+ break if options[:max_urls] && i >= options[:max_urls]
+ begin
+ page = agent.get(url)
+ Spidey.logger.info "Handling #{url.inspect}"
+ process_crawl(url, page)
+ send handler, page, default_data
+ rescue => ex
+ add_error url: url, handler: handler, error: ex
+ end
+ sleep request_interval if request_interval > 0
+ i += 1
+ end
+ end


- def handle(url, handler, handle_data = {})
- Spidey.logger.info "Queueing #{url.inspect[0..200]}..."
+ def handle(url, handler, handle_data = {})
+ Spidey.logger.info "Queueing #{url.inspect[0..200]}..."

- spider_name = self.class.name
- @url_collection[url] ||= HashUrlRecord.spidey_handle( url, handler, spider_name, handle_data )
- end
+ spider_name = self.class.name
+ @url_collection[url] ||= HashUrlRecord.spidey_handle( url, handler, spider_name, handle_data )
+ end

- # expects @url_collection to have :url, but if not, creates new HashUrlRecord
- def record(data_hashie)
- url = data_hashie.url
- h_url = @url_collection[url] || HashUrlRecord.new(url)
+ # expects @url_collection to have :url, but if not, creates new HashUrlRecord
+ def record(data_hashie)
+ url = data_hashie.url
+ h_url = @url_collection[url] || HashUrlRecord.new(url)

- # set the content and record_timestamp of the HashUrlRecord
- h_url.record_content(data_hashie.content)
+ # set the content and record_timestamp of the HashUrlRecord
+ h_url.record_content(data_hashie.content)

- # reassign, update collection
- @url_collection[url] = h_url
- end
+ # reassign, update collection
+ @url_collection[url] = h_url
+ end


- # wrapper around #record
- def record_page(page, default_data={})
- msh = Hashie::Mash.new(default_data)
- msh.url = page.uri.to_s
- msh.content = page.content
+ # wrapper around #record
+ def record_page(page, default_data={})
+ msh = Hashie::Mash.new(default_data)
+ msh.url = page.uri.to_s
+ msh.content = page.content

- record(msh)
- end
+ record(msh)
+ end

- def each_url(&block)
- while h_url = get_next_url_hash
- yield h_url.url, h_url.handler, h_url.handle_data
- end
- end
+ def each_url(&block)
+ while h_url = get_next_url_hash
+ yield h_url.url, h_url.handler, h_url.handle_data
+ end
+ end

- protected
+ protected

- def add_error(attrs)
- @error_collection << attrs
- Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
- end
+ def add_error(attrs)
+ @error_collection << attrs
+ Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
+ end


  private

  def get_next_url_hash
  return nil if (@until && Time.now >= @until) # exceeded time bound
-
  # uncrawled is a filtered collection
  uncrawled.values.first
  end
-
-
  end
  end
  end
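
The HashStore strategy above is only re-indented and de-blanked in this release; its public surface (#initialize, #handle, #crawl, #crawls, #uncrawled, #records, #record, #record_page) is unchanged. For orientation, a minimal usage sketch follows. It assumes the spidey gem's Spidey::AbstractSpider base class (which the strategy's super(attrs) call implies) and an illustrative ExampleSpider, handler name, and URL that are not part of the gem.

require 'hash_spidey'

# Hypothetical spider for illustration only.
class ExampleSpider < Spidey::AbstractSpider
  include HashSpidey::Strategies::HashStore

  # #crawl dispatches via `send handler, page, default_data`,
  # so handlers take the fetched page plus a data hash.
  def process_index(page, default_data = {})
    record_page(page, default_data)  # stores the page's url and content via #record
  end
end

spider = ExampleSpider.new request_interval: 0
spider.handle "http://www.example.com/", :process_index
spider.crawl max_urls: 10    # crawl_for: <seconds> would set the @until time bound

spider.crawls     # Hash of URL records that have been crawled
spider.uncrawled  # Hash of URL records still queued
spider.records    # Hash of URL records whose content has been recorded
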
@@ -1,3 +1,3 @@
  module HashSpidey
- VERSION = "0.0.1"
+ VERSION = "0.0.2"
  end
@@ -16,13 +16,13 @@ describe HashSpidey::Strategies::HashStore do
  end

  end
-
+
  context 'generic #handle' do

  before(:each) do
  FakeWeb.register_uri(:get, "http://www.example.com/", :body => "Hello World", code: 200,
  "content-type"=>"text/html; charset=UTF-8"
- )
+ )
  @spider = TestSpider.new request_interval: 0
  @spider.handle "http://www.example.com/", :process_size
  @spider.crawl
@@ -36,21 +36,20 @@ describe HashSpidey::Strategies::HashStore do
 
  it 'should update #crawled_timestamp' do
  @crawled_url = @spider.crawls.values.first
- expect( @crawled_url.url ).to eq 'http://www.example.com/'
- expect( @crawled_url.crawled_timestamp > @crawled_url.initialized_timestamp).to be_true
+ expect( @crawled_url.url ).to eq 'http://www.example.com/'
+ expect( @crawled_url.crawled_timestamp > @crawled_url.initialized_timestamp).to be_true
  end

  it 'should have #crawls act as a Hash' do
  expect( @spider.crawls['http://www.example.com/'].url).to eq 'http://www.example.com/'
  end
-
+
  it "should not add duplicate URLs" do
- @spider.handle "http://www.example.com/", :process_something_else # second time
- expect( @spider.crawls.count ).to eq 1
+ @spider.handle "http://www.example.com/", :process_something_else # second time
+ expect( @spider.crawls.count ).to eq 1
  end
-
- context '@crawl_record' do

+ context '@crawl_record' do
  before(:each) do
  @crawled_url = @spider.crawls["http://www.example.com/"]
  end
@@ -60,13 +59,10 @@ describe HashSpidey::Strategies::HashStore do
  end

  it 'should respond to header#content-type' do
- expect(@crawled_url.header['content-type']).to eq "text/html; charset=UTF-8"
+ expect(@crawled_url.crawleheader['content-type']).to eq "text/html; charset=UTF-8"
  end
  end
  end
-
-
-
  end

@@ -91,8 +87,4 @@ describe HashSpidey::Strategies::HashStore do
  end
  end
  end
-
-
-
-
  end
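
The spec changes above only touch the #handle and #crawl examples; the #record path can also be driven directly with a Hashie::Mash carrying url and content, which is what #record_page builds internally. A rough sketch, reusing the hypothetical ExampleSpider from the earlier example and assuming HashUrlRecord#recorded? becomes true once record_content has been called:

require 'hashie'

spider = ExampleSpider.new request_interval: 0

# #record falls back to HashUrlRecord.new(url) when the URL was never queued via #handle.
data = Hashie::Mash.new(url: "http://www.example.com/about", content: "<html>...</html>")
spider.record(data)

spider.records.keys  # expected to include "http://www.example.com/about"
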
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: hash_spidey
  version: !ruby/object:Gem::Version
- version: 0.0.1
+ version: 0.0.2
  prerelease:
  platform: ruby
  authors: