hash_spidey 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +10 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +20 -0
- data/hash_spidey.gemspec +29 -0
- data/lib/hash_spidey.rb +12 -0
- data/lib/hash_spidey/crawl_record.rb +35 -0
- data/lib/hash_spidey/hash_url_record.rb +73 -0
- data/lib/hash_spidey/strategies/hash_store_strategy.rb +112 -0
- data/lib/hash_spidey/version.rb +3 -0
- data/spec/spec.rake +17 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/spiders/hash_store_strategy_spec.rb +98 -0
- data/spec/unit/hash_url_record_spec.rb +73 -0
- metadata +146 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2013 dannguyen
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,29 @@
+# HashSpidey
+
+A sloppy implementation of [joeyAghion's Spidey](https://github.com/joeyAghion/spidey) abstract web crawling, using in-memory Hash to save pages and links. Very smelly and unstable until I figure out the best API.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+    gem 'hash_spidey'
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install hash_spidey
+
+## Usage
+
+TODO: Write usage instructions here
+
+## Contributing
+
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
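The Usage section upstream is still a TODO. As a hedged sketch pieced together from the bundled specs (the class and handler names below are illustrative, not part of the gem, and running it performs a real HTTP request):

```ruby
require 'hash_spidey'
require 'hashie'

# MySpider and :process_page are hypothetical; they mirror TestSpider
# in the gem's specs. record_page is the gem's built-in wrapper, but
# building the Mash by hand follows the path the specs actually exercise.
class MySpider < HashSpidey::AbstractSpider
  # Handler invoked by #crawl with each fetched Mechanize page.
  def process_page(page, default_data = {})
    record Hashie::Mash.new(url: page.uri.to_s, content: page.body)
  end
end

spider = MySpider.new(request_interval: 0)
spider.handle "http://www.example.com/", :process_page
spider.crawl max_urls: 1

spider.crawls.keys   # => ["http://www.example.com/"]
spider.records.count # => 1
```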
data/Rakefile
ADDED
@@ -0,0 +1,20 @@
+require 'rdoc/task'
+require 'rubygems'
+require 'rubygems/package_task'
+require 'rspec/core/rake_task'
+
+desc 'Default: run specs.'
+task :default => :rspec
+
+desc 'Run the specs'
+RSpec::Core::RakeTask.new(:rspec) do |t|
+  t.rspec_opts = ['--color']
+  t.pattern = './spec/**/*_spec.rb'
+end
+
+spec = Gem::Specification.load("#{File.dirname(__FILE__)}/hash_spidey.gemspec")
+
+desc "Package gem."
+Gem::PackageTask.new(spec) do |pkg|
+  pkg.gem_spec = spec
+end
data/hash_spidey.gemspec
ADDED
@@ -0,0 +1,29 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'hash_spidey/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = "hash_spidey"
+  spec.version       = HashSpidey::VERSION
+  spec.authors       = ["dannguyen"]
+  spec.email         = ["dansonguyen@gmail.com"]
+  spec.description   = %q{An implementation of joeyAghion's Spidey class at Artsy}
+  spec.summary       = %q{Uses a Hash object to store crawling process, which it can then dump to an external store}
+  spec.homepage      = "http://github.com/dannguyen"
+  spec.license       = "MIT"
+
+  spec.files         = `git ls-files`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.add_development_dependency "bundler", "~> 1.3"
+  spec.add_development_dependency "rake"
+
+
+  spec.add_dependency 'spidey', '~> 0.1'
+  spec.add_dependency 'hashie'
+  spec.add_dependency 'addressable'
+
+end
data/lib/hash_spidey.rb
ADDED
@@ -0,0 +1,12 @@
+require "hash_spidey/version"
+
+require 'hashie'
+require 'spidey'
+require_relative 'hash_spidey/hash_url_record'
+require_relative 'hash_spidey/strategies/hash_store_strategy'
+
+module HashSpidey
+  class AbstractSpider < Spidey::AbstractSpider
+    include HashSpidey::Strategies::HashStore
+  end
+end
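This entry point pre-wires the strategy into a ready-made base class, so subclassing `HashSpidey::AbstractSpider` is equivalent to mixing `HashSpidey::Strategies::HashStore` into a plain `Spidey::AbstractSpider` yourself. A minimal sketch of the two equivalent spellings (class names are illustrative):

```ruby
require 'hash_spidey'

# Spelling 1: inherit the pre-wired base class.
class SpiderA < HashSpidey::AbstractSpider; end

# Spelling 2: mix the strategy into the raw Spidey base directly.
class SpiderB < Spidey::AbstractSpider
  include HashSpidey::Strategies::HashStore
end

SpiderA.ancestors.include?(HashSpidey::Strategies::HashStore) # => true
SpiderB.ancestors.include?(HashSpidey::Strategies::HashStore) # => true
```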
data/lib/hash_spidey/crawl_record.rb
ADDED

@@ -0,0 +1,35 @@
+require 'hashie'
+require 'mechanize'
+
+module HashSpidey
+
+  class CrawlRecord < BasicObject
+
+    META_ATTS = %w(crawled_timestamp title header code response_header_charset meta_charset detected_encoding content_type)
+    attr_reader :crawled_timestamp
+
+    def initialize(obj, timestamp)
+      @crawled_timestamp = timestamp
+      @page_object = obj
+    end
+
+    def to_hash
+      msh = Hashie::Mash.new
+      META_ATTS.each do |att|
+        msh[att] = self.send(att) if self.respond_to?(att)
+      end
+      return msh
+    end
+
+    protected
+
+    def method_missing(name, *args, &block)
+      if @page_object.respond_to?(name)
+        @page_object.send(name, *args, &block)
+      else
+        super
+      end
+    end
+
+  end
+end
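Because `CrawlRecord` inherits from `BasicObject`, even `send` and `respond_to?` fall through `method_missing` to the wrapped page object, so nearly every call is forwarded. (Note that `to_hash` references `Hashie::Mash` unqualified, which a `BasicObject` subclass appears unable to resolve without the `::` prefix, so only the delegation path is exercised below.) A sketch with a hypothetical stand-in for a Mechanize page:

```ruby
require 'hash_spidey'

# PageStub is hypothetical, standing in for a Mechanize::Page.
PageStub = Struct.new(:title, :code)

record = HashSpidey::CrawlRecord.new(PageStub.new('Example', '200'), Time.now)

record.title              # => "Example" (forwarded via method_missing)
record.code               # => "200"    (also forwarded to the stub)
record.crawled_timestamp  # the Time passed at construction (own attr_reader)
```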
data/lib/hash_spidey/hash_url_record.rb
ADDED

@@ -0,0 +1,73 @@
+require 'addressable/uri'
+require_relative 'crawl_record'
+
+module HashSpidey
+  class HashUrlRecord
+
+    attr_reader :url, :code,
+      :initialized_timestamp, :crawled_timestamp, :recorded_timestamp,
+      :content, :handler, :spider, :handle_data,
+      :crawl_metadata
+
+
+    # convenience name for spidey
+    def self.spidey_handle(url, handler, spider, opts)
+      mash_opts = Hashie::Mash.new opts
+      mash_opts.spider = spider
+      mash_opts.handler = handler
+
+      return HashUrlRecord.new url, mash_opts
+    end
+
+    def initialize(url, opts={})
+      @url = url
+      @addressable_uri = Addressable::URI.parse(@url)
+      @initialized_timestamp = Time.now
+
+      mash_opts = Hashie::Mash.new(opts)
+      @spider = mash_opts.delete :spider
+      @handler = mash_opts.delete :handler
+      @handle_data = mash_opts.delete :handle_data # not sure if needed?...
+    end
+
+
+    def record_content(ct)
+      @content = ct
+      @recorded_timestamp = Time.now
+    end
+
+    # saves data related
+    def mark_as_crawled(page_obj={})
+      @crawled_timestamp = Time.now
+      # do something with mechanized page object
+      @crawl_metadata = HashSpidey::CrawlRecord.new(page_obj, @crawled_timestamp)
+    end
+
+    def recorded?
+      !(@recorded_timestamp.nil?)
+    end
+
+    def crawled?
+      !(crawled_timestamp.nil?)
+    end
+
+
+    ## this is just an alias
+
+    # obvious smells
+    def collected_timestamp; @recorded_timestamp; end
+    def header; @crawl_metadata.header unless @crawl_metadata.nil? ; end
+    def code; @crawl_metadata.code unless @crawl_metadata.nil? ; end
+
+    #### url inspection methods
+    [:host, :port, :query, :scheme, :path ].each do |foo|
+      define_method foo do
+        @addressable_uri.send foo
+      end
+    end
+
+    def query_values
+      @addressable_uri.query_values
+    end
+  end
+end
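A short sketch of the record lifecycle, following the gem's own unit specs (the URL here is arbitrary): the URI readers delegate to the parsed `Addressable::URI`, and the `recorded?`/`crawled?` flags flip as state changes.

```ruby
require 'hash_spidey'

rec = HashSpidey::HashUrlRecord.new 'http://www.example.com:8080/stuff?q=1'

# URI readers defined against the parsed Addressable::URI:
rec.host          # => "www.example.com"
rec.port          # => 8080
rec.query_values  # => {"q"=>"1"}

# Lifecycle flags flip as content is recorded / the URL is crawled:
rec.recorded?             # => false
rec.record_content 'hello'
rec.recorded?             # => true
rec.content               # => "hello"

rec.crawled?              # => false
rec.mark_as_crawled
rec.crawled?              # => true
```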
data/lib/hash_spidey/strategies/hash_store_strategy.rb
ADDED

@@ -0,0 +1,112 @@
+module HashSpidey
+  module Strategies
+
+    module HashStore
+
+      def initialize(attrs = {})
+        @url_collection = {}
+        @error_collection = []
+
+        super(attrs)
+      end
+
+      #### process strategies
+
+
+      ## conveinence methods
+      def crawls
+        @url_collection.select{|k,v| v.crawled?}
+      end
+
+
+      def uncrawled
+        @url_collection.reject{|k,v| v.crawled?}
+      end
+
+      def records
+        @url_collection.select{|k,v| v.recorded?}
+      end
+
+      def process_crawl(url, page)
+        h_url = @url_collection[url]
+        h_url.mark_as_crawled(page)
+      end
+
+
+      def crawl(options = {})
+        @crawl_started_at = Time.now
+        @until = Time.now + options[:crawl_for] if options[:crawl_for]
+
+        i = 0
+        each_url do |url, handler, default_data|
+          break if options[:max_urls] && i >= options[:max_urls]
+          begin
+            page = agent.get(url)
+            Spidey.logger.info "Handling #{url.inspect}"
+            process_crawl(url, page)
+            send handler, page, default_data
+          rescue => ex
+            add_error url: url, handler: handler, error: ex
+          end
+          sleep request_interval if request_interval > 0
+          i += 1
+        end
+      end
+
+
+      def handle(url, handler, handle_data = {})
+        Spidey.logger.info "Queueing #{url.inspect[0..200]}..."
+
+        spider_name = self.class.name
+        @url_collection[url] ||= HashUrlRecord.spidey_handle( url, handler, spider_name, handle_data )
+      end
+
+      # expects @url_collection to have :url, but if not, creates new HashUrlRecord
+      def record(data_hashie)
+        url = data_hashie.url
+        h_url = @url_collection[url] || HashUrlRecord.new(url)
+
+        # set the content and record_timestamp of the HashUrlRecord
+        h_url.record_content(data_hashie.content)
+
+        # reassign, update collection
+        @url_collection[url] = h_url
+      end
+
+
+      # wrapper around #record
+      def record_page(page, default_data={})
+        msh = Hashie::Mash.new(default_data)
+        msh.url = page.uri.to_s
+        msh.content = page.content
+
+        record(msh)
+      end
+
+      def each_url(&block)
+        while h_url = get_next_url_hash
+          yield h_url.url, h_url.handler, h_url.handle_data
+        end
+      end
+
+      protected
+
+      def add_error(attrs)
+        @error_collection << attrs
+        Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
+      end
+
+
+      private
+
+      def get_next_url_hash
+        return nil if (@until && Time.now >= @until) # exceeded time bound
+
+        # uncrawled is a filtered collection
+        uncrawled.values.first
+      end
+
+
+    end
+  end
+end
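A hedged sketch of the store API in isolation, without any network traffic (the class name `StoreSpider` is illustrative): `#record` accepts anything responding to `#url` and `#content` (the bundled specs pass a `Hashie::Mash`) and upserts a `HashUrlRecord` into the in-memory collection.

```ruby
require 'hash_spidey'

# Bare subclass; the HashStore strategy is already mixed into the base.
class StoreSpider < HashSpidey::AbstractSpider; end

spider = StoreSpider.new(request_interval: 0)

spider.record Hashie::Mash.new(url: 'http://www.example.com/', content: 'Hello')
spider.records.count                                # => 1
spider.records['http://www.example.com/'].content   # => "Hello"

# Recording the same URL again updates the entry rather than duplicating it:
spider.record Hashie::Mash.new(url: 'http://www.example.com/', content: 'Bye')
spider.records.count                                # => 1
```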
data/spec/spec.rake
ADDED
@@ -0,0 +1,17 @@
+begin
+  require 'rspec/core/rake_task'
+
+  spec_tasks = Dir['spec/*/'].map { |d| File.basename(d) }
+
+  spec_tasks.each do |folder|
+    RSpec::Core::RakeTask.new("spec:#{folder}") do |t|
+      t.pattern = "./spec/#{folder}/**/*_spec.rb"
+      t.rspec_opts = %w(-fs --color)
+    end
+  end
+
+  desc "Run complete application spec suite"
+  task 'spec' => spec_tasks.map { |f| "spec:#{f}" }
+rescue LoadError
+  puts "RSpec is not part of this bundle, skip specs."
+end
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,15 @@
+require 'hash_spidey'
+require 'fakeweb'
+
+RSpec.configure do |config|
+  config.filter_run_excluding :skip => true
+  config.formatter = :documentation # :progress, :html, :textmate
+  config.fail_fast = true
+  config.before(:each) do
+  end
+
+  config.after(:each) do
+  end
+end
+
+
data/spec/spiders/hash_store_strategy_spec.rb
ADDED

@@ -0,0 +1,98 @@
+require 'spec_helper'
+
+describe HashSpidey::Strategies::HashStore do
+
+  before(:each) do
+
+  end
+
+
+  class TestSpider < HashSpidey::AbstractSpider
+    DEFAULT_REQUEST_INTERVAL = 0.001
+
+    include HashSpidey::Strategies::HashStore
+    def process_size(npage, data={})
+      npage.inspect
+    end
+
+  end
+
+  context 'generic #handle' do
+
+    before(:each) do
+      FakeWeb.register_uri(:get, "http://www.example.com/", :body => "Hello World", code: 200,
+        "content-type"=>"text/html; charset=UTF-8"
+      )
+      @spider = TestSpider.new request_interval: 0
+      @spider.handle "http://www.example.com/", :process_size
+      @spider.crawl
+    end
+
+    describe '#crawls' do
+      it 'should only add to #crawls' do
+        expect( @spider.crawls.count ).to eq 1
+        expect( @spider.records.count ).to eq 0
+      end
+
+      it 'should update #crawled_timestamp' do
+        @crawled_url = @spider.crawls.values.first
+        expect( @crawled_url.url ).to eq 'http://www.example.com/'
+        expect( @crawled_url.crawled_timestamp > @crawled_url.initialized_timestamp).to be_true
+      end
+
+      it 'should have #crawls act as a Hash' do
+        expect( @spider.crawls['http://www.example.com/'].url).to eq 'http://www.example.com/'
+      end
+
+      it "should not add duplicate URLs" do
+        @spider.handle "http://www.example.com/", :process_something_else # second time
+        expect( @spider.crawls.count ).to eq 1
+      end
+
+      context '@crawl_record' do
+
+        before(:each) do
+          @crawled_url = @spider.crawls["http://www.example.com/"]
+        end
+
+        it 'should respond to #code' do
+          expect(@crawled_url.code).to eq '200'
+        end
+
+        it 'should respond to header#content-type' do
+          expect(@crawled_url.header['content-type']).to eq "text/html; charset=UTF-8"
+        end
+      end
+    end
+
+
+
+  end
+
+
+  context 'generic #record' do
+    describe '#records' do
+      before(:each) do
+
+        @data = Hashie::Mash.new url: 'http://www.example.com/', content: 'Hello World'
+        @spider = TestSpider.new request_interval: 0
+        @spider.record @data
+      end
+
+      it "should add to records" do
+        expect(@spider.records.count).to eq 1
+        expect(@spider.records['http://www.example.com/'].content).to eq 'Hello World'
+      end
+
+      it 'should update existing result' do
+        @spider.record Hashie::Mash.new url: 'http://www.example.com/', content: 'Bye World'
+        expect(@spider.records['http://www.example.com/'].content).to eq 'Bye World'
+        expect(@spider.records.count).to eq 1
+      end
+    end
+  end
+
+
+
+
+end
data/spec/unit/hash_url_record_spec.rb
ADDED

@@ -0,0 +1,73 @@
+require 'spec_helper'
+
+include HashSpidey
+describe HashSpidey::HashUrlRecord do
+
+
+  context "delegate URI methods to Addressable::URI" do
+
+    before(:each) do
+      @hurl = HashUrlRecord.new 'http://www.example.com:80/stuff/?q=1&a=2&b=hello'
+    end
+
+
+    it 'should have #host' do
+      expect( @hurl.host ).to eq 'www.example.com'
+    end
+
+    it 'should have #port' do
+      expect( @hurl.port ).to eq 80
+    end
+
+    it 'should have #query' do
+      expect( @hurl.query ).to eq 'q=1&a=2&b=hello'
+    end
+
+    it 'should have #scheme' do
+      expect( @hurl.scheme ).to eq 'http'
+    end
+
+    it 'should have #path' do
+      expect( @hurl.path ).to eq '/stuff/'
+    end
+  end
+
+  context "state changes upon record and crawl" do
+    before(:each) do
+      @hurl = HashUrlRecord.new "http://www.example.com"
+    end
+
+    describe '#record_content' do
+      before(:each) do
+        @hurl.record_content 'hello'
+      end
+
+      it 'should set @recorded_timestamp' do
+        expect( @hurl.recorded_timestamp ).to be_within(2).of Time.now
+      end
+
+      it 'should set @content' do
+        expect( @hurl.content ).to eq 'hello'
+      end
+
+      it 'should have #recorded? be true' do
+        expect( @hurl.recorded?).to be_true
+      end
+    end
+
+    describe '#mark_as_crawled' do
+      before(:each) do
+        @hurl.mark_as_crawled
+      end
+
+      it 'should set @crawled_timestamp' do
+        expect( @hurl.crawled_timestamp ).to be_within(2).of Time.now
+      end
+
+      it 'should have #crawled? be true' do
+        expect( @hurl.crawled?).to be_true
+      end
+    end
+  end
+
+end
metadata
ADDED
@@ -0,0 +1,146 @@
+--- !ruby/object:Gem::Specification
+name: hash_spidey
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+prerelease:
+platform: ruby
+authors:
+- dannguyen
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-06-16 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: spidey
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.1'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.1'
+- !ruby/object:Gem::Dependency
+  name: hashie
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: addressable
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: An implementation of joeyAghion's Spidey class at Artsy
+email:
+- dansonguyen@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- hash_spidey.gemspec
+- lib/hash_spidey.rb
+- lib/hash_spidey/crawl_record.rb
+- lib/hash_spidey/hash_url_record.rb
+- lib/hash_spidey/strategies/hash_store_strategy.rb
+- lib/hash_spidey/version.rb
+- spec/spec.rake
+- spec/spec_helper.rb
+- spec/spiders/hash_store_strategy_spec.rb
+- spec/unit/hash_url_record_spec.rb
+homepage: http://github.com/dannguyen
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.23
+signing_key:
+specification_version: 3
+summary: Uses a Hash object to store crawling process, which it can then dump to an
+  external store
+test_files:
+- spec/spec.rake
+- spec/spec_helper.rb
+- spec/spiders/hash_store_strategy_spec.rb
+- spec/unit/hash_url_record_spec.rb