hash_spidey 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,112 +1,108 @@
 module HashSpidey
   module Strategies
-
     module HashStore

-
-
-
-
-        super(attrs)
-      end
+      def initialize(attrs = {})
+        @url_collection = {}
+        @error_collection = []

-
+        super(attrs)
+      end

+      #### process strategies

-      ## conveinence methods
-      def crawls
-        @url_collection.select{|k,v| v.crawled?}
-      end

+      ## conveinence methods
+      def crawls
+        @url_collection.select{|k,v| v.crawled?}
+      end

-      def uncrawled
-        @url_collection.reject{|k,v| v.crawled?}
-      end

-
-
-
+      def uncrawled
+        @url_collection.reject{|k,v| v.crawled?}
+      end

-
-
-
-      end
+      def records
+        @url_collection.select{|k,v| v.recorded?}
+      end

+      def process_crawl(url, page)
+        h_url = @url_collection[url]
+        h_url.mark_as_crawled(page)
+      end

-      def crawl(options = {})
-        @crawl_started_at = Time.now
-        @until = Time.now + options[:crawl_for] if options[:crawl_for]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+      def crawl(options = {})
+        @crawl_started_at = Time.now
+        @until = Time.now + options[:crawl_for] if options[:crawl_for]
+
+        i = 0
+        each_url do |url, handler, default_data|
+          break if options[:max_urls] && i >= options[:max_urls]
+          begin
+            page = agent.get(url)
+            Spidey.logger.info "Handling #{url.inspect}"
+            process_crawl(url, page)
+            send handler, page, default_data
+          rescue => ex
+            add_error url: url, handler: handler, error: ex
+          end
+          sleep request_interval if request_interval > 0
+          i += 1
+        end
+      end


-
-
+      def handle(url, handler, handle_data = {})
+        Spidey.logger.info "Queueing #{url.inspect[0..200]}..."

-
-
-
+        spider_name = self.class.name
+        @url_collection[url] ||= HashUrlRecord.spidey_handle( url, handler, spider_name, handle_data )
+      end

-
-
-
-
+      # expects @url_collection to have :url, but if not, creates new HashUrlRecord
+      def record(data_hashie)
+        url = data_hashie.url
+        h_url = @url_collection[url] || HashUrlRecord.new(url)

-
-
+        # set the content and record_timestamp of the HashUrlRecord
+        h_url.record_content(data_hashie.content)

-
-
-
+        # reassign, update collection
+        @url_collection[url] = h_url
+      end


-
-
-
-
-
+      # wrapper around #record
+      def record_page(page, default_data={})
+        msh = Hashie::Mash.new(default_data)
+        msh.url = page.uri.to_s
+        msh.content = page.content

-
-
+        record(msh)
+      end

-
-
-
-
-
+      def each_url(&block)
+        while h_url = get_next_url_hash
+          yield h_url.url, h_url.handler, h_url.handle_data
+        end
+      end

-
+      protected

-
-
-
-
+      def add_error(attrs)
+        @error_collection << attrs
+        Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
+      end


       private

       def get_next_url_hash
         return nil if (@until && Time.now >= @until) # exceeded time bound
-
         # uncrawled is a filtered collection
         uncrawled.values.first
       end
-
-
     end
   end
 end
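For orientation, the rewritten HashStore strategy above is meant to be mixed into a Spidey spider, much as the package's specs do with a TestSpider. Below is a minimal, illustrative sketch (not part of the package) of driving the new instance-level #handle / #crawl API; the ExampleSpider class and :process_page handler are hypothetical names, and the sketch assumes the spidey and hash_spidey gems are installed.

require 'spidey'        # provides Spidey::AbstractSpider and Spidey.logger
require 'hash_spidey'   # provides HashSpidey::Strategies::HashStore

# Hypothetical spider for illustration; the package's specs use a similar TestSpider.
class ExampleSpider < Spidey::AbstractSpider
  include HashSpidey::Strategies::HashStore

  # Handlers are invoked by #crawl as `send handler, page, default_data`.
  def process_page(page, default_data = {})
    record_page(page, default_data)   # stores the page body via the new #record_page
  end
end

spider = ExampleSpider.new request_interval: 0
spider.handle "http://www.example.com/", :process_page
spider.crawl max_urls: 10

spider.crawls     # URLs whose HashUrlRecord has been marked as crawled
spider.uncrawled  # URLs still queued
spider.records    # URLs with recorded content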
data/lib/hash_spidey/version.rb CHANGED

@@ -16,13 +16,13 @@ describe HashSpidey::Strategies::HashStore do
     end

   end
-
+
   context 'generic #handle' do

     before(:each) do
       FakeWeb.register_uri(:get, "http://www.example.com/", :body => "Hello World", code: 200,
         "content-type"=>"text/html; charset=UTF-8"
-
+      )
       @spider = TestSpider.new request_interval: 0
       @spider.handle "http://www.example.com/", :process_size
       @spider.crawl
@@ -36,21 +36,20 @@ describe HashSpidey::Strategies::HashStore do

     it 'should update #crawled_timestamp' do
       @crawled_url = @spider.crawls.values.first
-
-
+      expect( @crawled_url.url ).to eq 'http://www.example.com/'
+      expect( @crawled_url.crawled_timestamp > @crawled_url.initialized_timestamp).to be_true
     end

     it 'should have #crawls act as a Hash' do
       expect( @spider.crawls['http://www.example.com/'].url).to eq 'http://www.example.com/'
     end
-
+
     it "should not add duplicate URLs" do
-
-
+      @spider.handle "http://www.example.com/", :process_something_else # second time
+      expect( @spider.crawls.count ).to eq 1
     end
-
-    context '@crawl_record' do

+    context '@crawl_record' do
       before(:each) do
         @crawled_url = @spider.crawls["http://www.example.com/"]
       end
@@ -60,13 +59,10 @@ describe HashSpidey::Strategies::HashStore do
       end

       it 'should respond to header#content-type' do
-        expect(@crawled_url.
+        expect(@crawled_url.crawleheader['content-type']).to eq "text/html; charset=UTF-8"
       end
     end
   end
-
-
-
 end


@@ -91,8 +87,4 @@ describe HashSpidey::Strategies::HashStore do
     end
   end
 end
-
-
-
-
 end
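Beyond what the specs above exercise, 0.0.2 also adds #record and #record_page for storing content keyed by URL. The sketch below is again illustrative, reusing the hypothetical ExampleSpider from the earlier sketch: it feeds #record a Hashie::Mash directly, since the payload only needs to respond to #url and #content; whether the URL then appears under #records depends on HashUrlRecord#recorded? returning true once content has been set.

require 'hashie'

spider = ExampleSpider.new request_interval: 0   # hypothetical spider from the sketch above

msh = Hashie::Mash.new
msh.url     = "http://www.example.com/some-page"          # illustrative URL
msh.content = "<html><body>Hello World</body></html>"     # illustrative content

spider.record(msh)   # creates or updates the HashUrlRecord stored under msh.url
spider.records       # expected to include the URL once its record has content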