RubyGems - hash_spidey - Versions diffs - 0.0.2 → 0.0.3 - Mend

hash_spidey 0.0.2 → 0.0.3

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

data/examples/wikipedia_article_spider.rb +28 -0
data/hash_spidey.gemspec +1 -1
data/lib/hash_spidey/crawl_record.rb +11 -8
data/lib/hash_spidey/hash_url_record.rb +9 -3
data/lib/hash_spidey/strategies/hash_store_strategy.rb +15 -11
data/lib/hash_spidey/version.rb +1 -1
data/spec/spiders/hash_store_strategy_spec.rb +10 -5
data/spec/unit/hash_url_record_spec.rb +2 -2
metadata +3 -2

data/examples/wikipedia_article_spider.rb ADDED Viewed

@@ -0,0 +1,28 @@
+# obviously, you should just use Wikipedia's API
+class WikipediaArticleSpider < HashSpidey::AbstractSpider
+	def initialize(first_article_url, opts)
+		super(opts)
+		handle first_article_url, :process_article
+	end
+	def process_article(page, default_opts={})
+		record_page(page)
+		page.search('a').select{|a| a['href'] =~ /wiki\/Category:/}.each do |a|
+			href = resolve_url( a['href'], page)
+			handle href, :process_category_page
+		end
+	end
+	def process_category_page(page, default_opts={})
+		title = page.title
+		page_count_text = page.search('#mw-pages > p')[0].text.match(/[\d,]+ total\./)
+		datastr = "#{title} has #{page_count_text}"
+		record_data(page, datastr)
+	end
+end

data/hash_spidey.gemspec CHANGED Viewed

@@ -8,7 +8,7 @@ Gem::Specification.new do |spec|
   spec.version       = HashSpidey::VERSION
   spec.authors       = ["dannguyen"]
   spec.email         = ["dansonguyen@gmail.com"]
-  spec.description   = %q{An implementation of joeyAghion's Spidey class at Artsy}
+  spec.description   = %q{An implementation of Artsy's joeyAghion's Spidey::AbstractSpider}
   spec.summary       = %q{Uses a Hash object to store crawling process, which it can then dump to an external store}
   spec.homepage      = "http://github.com/dannguyen"
   spec.license       = "MIT"

data/lib/hash_spidey/crawl_record.rb CHANGED Viewed

@@ -3,22 +3,25 @@ require 'mechanize'
 module HashSpidey
-	class CrawlRecord < BasicObject
+	class CrawlRecord
 		META_ATTS = %w(crawled_timestamp title header code response_header_charset meta_charset detected_encoding content_type)
 		attr_reader :crawled_timestamp
 		def initialize(obj, timestamp)
 			@crawled_timestamp = timestamp
-			@page_object = obj
-		end
-		def to_hash
-			msh = Hashie::Mash.new
-			META_ATTS.each do |att|
-				msh[att] = self.send(att) if self.respond_to?(att)
+			@page_object = META_ATTS.inject(Hashie::Mash.new) do |msh, att|
+				msh[att] = obj.send(att) if obj.respond_to?(att)
+				msh
 			end
-			return msh
+			@page_object.crawled_timestamp = @crawled_timestamp
+		end
+		def to_hash
+			return @page_object
 		end
 		protected

data/lib/hash_spidey/hash_url_record.rb CHANGED Viewed

@@ -7,7 +7,7 @@ module HashSpidey
 		attr_reader :url, :code,
 			:initialized_timestamp, :crawled_timestamp, :recorded_timestamp,
 			:content, :handler, :spider, :handle_data,
-			:crawl_metadata
+			:crawl_metadata, :parsed_data
 		# convenience name for spidey
@@ -31,8 +31,11 @@ module HashSpidey
 		end
-		def record_content(ct)
-			@content = ct
+		def mark_record(obj)
+			obj = Hashie::Mash.new(obj) if obj.is_a?(Hash)
+			@content = obj.content if obj.respond_to?(:content)
+			@parsed_data = obj.parsed_data if obj.respond_to?(:parsed_data)
 			@recorded_timestamp = Time.now
 		end
@@ -51,6 +54,9 @@ module HashSpidey
 			!(crawled_timestamp.nil?)
 		end
+		def has_content?
+			!(@content.nil? || @content.empty?)
+		end
 		## this is just an alias

data/lib/hash_spidey/strategies/hash_store_strategy.rb CHANGED Viewed

@@ -5,6 +5,7 @@ module HashSpidey
 			def initialize(attrs = {})
 				@url_collection = {}
 				@error_collection = []
+				agent.user_agent = "Abstract Spider"
 				super(attrs)
 			end
@@ -42,8 +43,8 @@ module HashSpidey
 					begin
 						page = agent.get(url)
 						Spidey.logger.info "Handling #{url.inspect}"
-						process_crawl(url, page)
 						send handler, page, default_data
+						process_crawl(url, page)
 					rescue => ex
 						add_error url: url, handler: handler, error: ex
 					end
@@ -61,27 +62,30 @@ module HashSpidey
 			end
 			# expects @url_collection to have :url, but if not, creates new HashUrlRecord
-			def record(data_hashie)
-				url = data_hashie.url
+			# data_hashie should have :content and/or :parsed_data
+			def record(url, data_hashie)
 				h_url = @url_collection[url] || HashUrlRecord.new(url)
 				# set the content and record_timestamp of the HashUrlRecord
-				h_url.record_content(data_hashie.content)
+				h_url.mark_record(data_hashie)
 				# reassign, update collection
 				@url_collection[url] = h_url
 			end
+			# convenience method, expecting :page to be a Nokogiri::Page
+			def record_page(page)
+				url = page.uri.to_s
+				record(url, content: page.content)
+			end
-			# wrapper around #record
-			def record_page(page, default_data={})
-				msh = Hashie::Mash.new(default_data)
-				msh.url = page.uri.to_s
-				msh.content = page.content
-				record(msh)
+			def record_data(page, data)
+				url = page.uri.to_s
+				record(url, parsed_data: data)
 			end
 			def each_url(&block)
 				while h_url = get_next_url_hash
 					yield h_url.url, h_url.handler, h_url.handle_data

data/lib/hash_spidey/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module HashSpidey
-  VERSION = "0.0.2"
+  VERSION = "0.0.3"
 end

data/spec/spiders/hash_store_strategy_spec.rb CHANGED Viewed

@@ -59,20 +59,23 @@ describe HashSpidey::Strategies::HashStore do
 				end
 				it 'should respond to header#content-type' do
-					expect(@crawled_url.crawleheader['content-type']).to eq "text/html; charset=UTF-8"
+					expect(@crawled_url.header['content-type']).to eq "text/html; charset=UTF-8"
 				end
 			end
 		end
 	end
-	context 'generic #record' do
+	context 'generic #record_page' do
 		describe '#records' do
 			before(:each) do
+				FakeWeb.register_uri(:get, "http://www.example.com/", :body => "Hello World", code: 200,
+				"content-type"=>"text/html; charset=UTF-8"
+				)
-				@data = Hashie::Mash.new url: 'http://www.example.com/', content: 'Hello World'
 				@spider = TestSpider.new request_interval: 0
-				@spider.record @data
+				@page = Mechanize.new.get("http://www.example.com/")
+				@spider.record_page @page
 			end
 			it "should add to records" do
@@ -81,7 +84,9 @@ describe HashSpidey::Strategies::HashStore do
 			end
 			it 'should update existing result' do
-				@spider.record Hashie::Mash.new url: 'http://www.example.com/', content: 'Bye World'
+				@page.stub(:content){ 'Bye World' }
+				@spider.record_page @page
 				expect(@spider.records['http://www.example.com/'].content).to eq 'Bye World'
 				expect(@spider.records.count).to eq 1
 			end

data/spec/unit/hash_url_record_spec.rb CHANGED Viewed

@@ -37,9 +37,9 @@ describe HashSpidey::HashUrlRecord do
 			@hurl = HashUrlRecord.new "http://www.example.com"
 		end
-		describe '#record_content' do
+		describe '#mark_record' do
 			before(:each) do
-				@hurl.record_content 'hello'
+				@hurl.mark_record content: 'hello'
 			end
 			it 'should set @recorded_timestamp' do

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: hash_spidey
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
   prerelease:
 platform: ruby
 authors:
@@ -91,7 +91,7 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
-description: An implementation of joeyAghion's Spidey class at Artsy
+description: An implementation of Artsy's joeyAghion's Spidey::AbstractSpider
 email:
 - dansonguyen@gmail.com
 executables: []
@@ -103,6 +103,7 @@ files:
 - LICENSE.txt
 - README.md
 - Rakefile
+- examples/wikipedia_article_spider.rb
 - hash_spidey.gemspec
 - lib/hash_spidey.rb
 - lib/hash_spidey/crawl_record.rb